Merge

mritchielab · Oct 13, 2023 · 337beeb · 337beeb
2 parents 224db35 + 6ce64d3
commit 337beeb
Show file tree

Hide file tree

Showing 6 changed files with 23 additions and 11 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -27,7 +27,6 @@ biocViews: RNASeq, SingleCell, Transcriptomics, DataImport,
 License: GPL (>= 2)
 Encoding: UTF-8
 Imports: 
-    arrangements,
     basilisk,
     bambu,
     Biostrings,

diff --git a/NAMESPACE b/NAMESPACE
@@ -84,7 +84,6 @@ importFrom(SummarizedExperiment,assays)
 importFrom(SummarizedExperiment,colData)
 importFrom(SummarizedExperiment,rowData)
 importFrom(SummarizedExperiment,rowRanges)
-importFrom(arrangements,combinations)
 importFrom(bambu,bambu)
 importFrom(bambu,prepareAnnotations)
 importFrom(bambu,writeToGTF)

diff --git a/R/find_isoform.R b/R/find_isoform.R
@@ -110,10 +110,14 @@ find_isoform_flames <- function(annotation, genome_fa, genome_bam, outdir, confi
 }
 
 #' GTF/GFF to FASTA conversion
-#' @description convert the transcript annotation to transcriptome assembly as FASTA file.
+#' @description Convert the transcript annotation to a transcriptome assembly as a FASTA file. The
+#' genome annotation is first imported as a TxDb object and then used to extract transcript sequences
+#' from the genome assembly.
 #' @param isoform_annotation Path to the annotation file (GTF/GFF3)
 #' @param genome_fa The file path to genome fasta file.
 #' @param outdir The path to directory to store the transcriptome as \code{transcript_assembly.fa}.
+#' @param extract_fn (optional) Function that takes the TxDb object built from the annotation and
+#' returns a \code{GRangesList}. E.g. \code{function(txdb){GenomicFeatures::cdsBy(txdb, by="tx", use.names=TRUE)}}
 #' @return Path to the outputted transcriptome assembly
 #'
 #' @importFrom Biostrings readDNAStringSet writeXStringSet
@@ -125,7 +129,7 @@ find_isoform_flames <- function(annotation, genome_fa, genome_bam, outdir, confi
 #' cat(readChar(fasta, nchars = 1e3))
 #'
 #' @export
-annotation_to_fasta <- function(isoform_annotation, genome_fa, outdir) {
+annotation_to_fasta <- function(isoform_annotation, genome_fa, outdir, extract_fn) {
   # check if all the transcript in the annotation is stranded
   annotation_d <- read.csv(isoform_annotation, sep = "\t", 
                     header = FALSE, stringsAsFactors = FALSE, 
@@ -146,10 +150,16 @@ annotation_to_fasta <- function(isoform_annotation, genome_fa, outdir) {
 
   dna_string_set <- Biostrings::readDNAStringSet(genome_fa)
   names(dna_string_set) <- gsub(" .*$", "", names(dna_string_set))
-  txdb <- GenomicFeatures::makeTxDbFromGFF(isoform_annotation)
+  if (missing(extract_fn)) {
+    txdb <- GenomicFeatures::makeTxDbFromGFF(isoform_annotation)
+    tr_string_set <- GenomicFeatures::extractTranscriptSeqs(dna_string_set, txdb,
+      use.names = TRUE)
+  } else {
+    extracted_grl<- extract_fn(txdb)
+    tr_string_set <- GenomicFeatures::extractTranscriptSeqs(dna_string_set, extracted_grl)
+    # additional arguments are allowed only when 'transcripts' is not a GRangesList object
+  }
 
-  tr_string_set <- GenomicFeatures::extractTranscriptSeqs(dna_string_set, txdb,
-    use.names = TRUE)
   if (length(names(tr_string_set)) > length(unique(names(tr_string_set)))) {
     cat("Duplicated transcript IDs present, removing ...")
     tr_string_set <- tr_string_set[unique(names(tr_string_set))]

diff --git a/R/model_decay.R b/R/model_decay.R
@@ -8,7 +8,6 @@
 #' @importFrom S4Vectors split
 #' @importFrom GenomicRanges strand
 #' @importFrom BiocGenerics start end
-#' @importFrom arrangements combinations
 #' 
 #' @param annotation path to the GTF annotation file, or the parsed GenomicRanges
 #' object.

diff --git a/man/annotation_to_fasta.Rd b/man/annotation_to_fasta.Rd
diff --git a/src/flexiplex.cpp b/src/flexiplex.cpp
@@ -195,7 +195,7 @@ EdlibAlignConfig edlibConf = {flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH,
 
   std::vector<long unsigned int> subpattern_ends;
   subpattern_ends.resize(subpattern_lengths.size());
-  std::inclusive_scan(subpattern_lengths.begin(), subpattern_lengths.end(),
+  std::partial_sum(subpattern_lengths.begin(), subpattern_lengths.end(),
                       subpattern_ends.begin());
 
   std::vector<int> read_to_subpatterns;