Skip to content

Commit

Permalink
Merge branch 'devel' of github.com:OliverVoogd/FLAMES into devel
Browse files Browse the repository at this point in the history
  • Loading branch information
OliverVoogd committed Oct 16, 2023
2 parents b3fa6f0 + 337beeb commit f388098
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 12 deletions.
16 changes: 16 additions & 0 deletions R/find_isoform.R
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,22 @@ find_isoform_flames <- function(annotation, genome_fa, genome_bam, outdir, confi
#'
#' @export
annotation_to_fasta <- function(isoform_annotation, genome_fa, outdir, extract_fn) {
# check if all the transcript in the annotation is stranded
annotation_d <- read.csv(isoform_annotation, sep = "\t",
header = FALSE, stringsAsFactors = FALSE,
comment.char = "#")
strands <- annotation_d[,7]
if (any(strands == '.')) {
strands[strands == '.'] <- '+'
annotation_d[,7] <- strands
modified_gtf <- paste0(tempfile(),'/tmp.gtf')
dir.create(dirname(modified_gtf))
write.table(annotation_d, modified_gtf, sep = "\t",
row.names = FALSE, quote = FALSE, col.names = FALSE)
isoform_annotation <- modified_gtf
}
rm(annotation_d, strands)

out_file <- file.path(outdir, "transcript_assembly.fa")

dna_string_set <- Biostrings::readDNAStringSet(genome_fa)
Expand Down
27 changes: 16 additions & 11 deletions inst/python/count_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,6 @@ def get_read_to_gene_assignment(in_bam, gene_idx_df, methods):
positions_5prim, overlaps, read_lengths = [[] for i in range(9)]

# process genes in the gene_idx_df
# for gene in tqdm(gene_idx_df.itertuples(),
# total=gene_idx_df.shape[0],
# desc="Processing Genes", unit="gene"):

for gene in gene_idx_df.itertuples():
reads_fetch = bam_file.fetch(gene.chr_name, gene.start, gene.end)
# for each reads, get the reference_name, mapping position, strand
Expand Down Expand Up @@ -177,6 +173,20 @@ def flames_read_id_parser(read_id, methods = 'flexiplex'):
sys.exit("Please specify the correct methods: 'flexiplex' or 'blaze'")

def quantify_gene(in_bam, in_gtf, n_process):
"""A multi-process wrapper of quantify_gene_single_process
Processing strategy:
1. split the gtf file by chromosome
2. for each chromosome, run quantify_gene_single_process
3. combine the gene count matrix
Input:
in_bam: bam file path
in_gtf: gtf file path
n_process: number of process to run
return:
gene_count_mat: pd.DataFrame, a matrix of gene counts
dup_read_lst: a list of duplicated read id
umi_lst: a list of umi
"""
# identify the demultiplexing methods
bam_file = pysam.AlignmentFile(in_bam, "rb")
first_read_id = next(bam_file).query_name
Expand All @@ -188,7 +198,7 @@ def quantify_gene(in_bam, in_gtf, n_process):
chr_names = in_gtf_df.chr_name.unique()
in_gtf_iter = (in_gtf_df[in_gtf_df.chr_name == x] for x in chr_names)


# run quantify_gene_single_process for each chromosome
print("Assigning reads to genes...")
gene_count_mat_dfs, dup_read_lst, umi_lst = [], [], []
for future in helper.multiprocessing_submit(
Expand Down Expand Up @@ -216,15 +226,10 @@ def quantify_gene_single_process(in_gtf_df, in_bam, demulti_methods):
Input:
in_bam: bam file path
in_gtf_df: gtf_df contains the subset of gene to run in a single process
estimate_saturation: whether to estimate saturation curve
plot_saturation_curve: plot filename for saturation curve, if specified,
`estimate_saturation` is autimatically set to True
demulti_methods: demultiplexing methods, 'flexiplex' or 'blaze'
Output:
gene_count_mat: a matrix of gene counts
read_gene_assign_df: a dataframe of read to gene assignment with umi corrected
Todo:
1. separate the gene assignment df to 1. reads unambiguously assigned to gene 2. reads assigned to multiple gene
2. when do the UMI dedup before 1, but need to make sure a same read would not be counted multiple times
"""

read_gene_assign_df = \
Expand Down
27 changes: 26 additions & 1 deletion inst/python/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from tqdm import tqdm
import os
import sys
import cProfile
import pstats

def reverse_complement(seq):
'''
Expand Down Expand Up @@ -262,4 +264,27 @@ def read_chunk_generator(file_handle, chunck_size):
lines = []
i = 0
if len(lines):
yield lines
yield lines

def profile(fn):
def profiled_fn(*args, **kwargs):
profiler = cProfile.Profile()
profiler.enable()
result = fn(*args, **kwargs)
profiler.disable()
profiler.create_stats()

# Change the filename as needed
profile_filename = f"{fn.__name__}_profile.cprof"
profiler.dump_stats(profile_filename)

# Optionally, print the profiling results
print(f"Profiling results for {fn.__name__}:")
with open(profile_filename, "rb") as f:
stats = pstats.Stats(profiler, stream=f)
stats.strip_dirs()
stats.sort_stats("cumulative")
stats.print_stats()

return result
return profiled_fn

0 comments on commit f388098

Please sign in to comment.