merge conflicts

mritchielab · Oct 2, 2023 · 9220fcf · 9220fcf
2 parents fe1bbff + 9d7a609
commit 9220fcf
Show file tree

Hide file tree

Showing 9 changed files with 103 additions and 41 deletions.
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -238,7 +238,7 @@ jobs:
           rcmdcheck::rcmdcheck(
               args = c("--no-manual", "--no-vignettes", "--timings"),
               build_args = c("--no-manual", "--keep-empty-dirs", "--no-resave-data"),
-              error_on = "warning",
+              error_on = "error",
               check_dir = "check"
           )
         shell: Rscript {0}

diff --git a/R/basilisk.R b/R/basilisk.R
@@ -1,7 +1,7 @@
 #' @importFrom basilisk BasiliskEnvironment
 flames_env <- BasiliskEnvironment(
     envname = "flames_env", pkgname = "FLAMES",
-    pip = c("fast-edit-distance==1.2.1", "blaze2==2.0.0a7"),
+    pip = c("fast-edit-distance==1.2.1", "blaze2==2.0.0a8"),
     packages = c(
         "python==3.10",
         "numpy==1.25.0",

diff --git a/R/create_config.R b/R/create_config.R
@@ -45,8 +45,8 @@
 #' @importFrom utils modifyList
 #' @importFrom stats setNames
 #' @export
-create_config <- function(outdir, type = "sc_5end", ...) {
-    if (type == "sc_5end") {
+create_config <- function(outdir, type = "sc_3end", ...) {
+    if (type == "sc_3end") {
         config <- jsonlite::fromJSON(system.file("extdata/config_sclr_nanopore_3end.json", package = "FLAMES"))
     } else if (type == "SIRV") {
         config <- jsonlite::fromJSON(system.file("extdata/SIRV_config_default.json", package = "FLAMES"))

diff --git a/R/quantification.R b/R/quantification.R
@@ -96,7 +96,7 @@ wrt_tr_to_csv <-
 #' 
 #' @param annotation The file path to the annotation file in GFF3 format
 #' @param outdir The path to directory to store all output files.
-#' @param n_process The number of threads to use to process the alignment file
+#' @param n_process The number of processes to use for parallelization.
 #' @param pipeline The pipeline type as a character string, either \code{sc_single_sample} (single-cell, single-sample),
 #' \code{bulk} (bulk, single or multi-sample), or \code{sc_multi_sample} (single-cell, multiple samples)
 #' @return The count matrix will be saved in the output folder as \code{transcript_count.csv.gz}.

diff --git a/inst/python/count_tr.py b/inst/python/count_tr.py
@@ -29,45 +29,66 @@
 
 
 def umi_dedup(l, has_UMI, max_ed=1):
+    """
+    Deduplicate UMI by allowing edit distance <= max_ed
+    Input:
+        l: list of UMI
+        has_UMI: whether the input has UMI
+        max_ed: maximum edit distance allowed for UMI deduplication
+
+    Return:
+        tuple of (read count, dedup UMI count) 
+            Note: if no UMI, the dedup UMI count is the same as read count
+    """
+    read_cnt = len(l)
+
     if has_UMI:
-        read_cnt = len(l)
         dup_cnt = Counter(l)
         l_cnt = sorted(dup_cnt.most_common(), key=lambda x: (x[1], x[0]), reverse=True)
         if len(l_cnt) == 1:
-            return (),1
+            return 1, 1
+
+        # remove UMI with edit distance <= max_ed
         rm_umi = {}
         for ith in range(len(l_cnt)-1):
             for jth in range(len(l_cnt)-1, ith, -1):  # first assess the low abundant UMI
                 if l_cnt[jth][0] not in rm_umi:
-                    #if editdistance.eval(l_cnt[ith][0], l_cnt[jth][0]) < 2:
-                    if fast_edit_distance.edit_distance(l_cnt[ith][0],l_cnt[jth][0], max_ed) <= max_ed:
+                    if fast_edit_distance.edit_distance(
+                            l_cnt[ith][0],l_cnt[jth][0], max_ed) <= max_ed:
                         rm_umi[l_cnt[jth][0]] = 1
-                        l_cnt[ith] = (l_cnt[ith][0], l_cnt[ith][1]+l_cnt[ith][1])
+                        l_cnt[ith] = (l_cnt[ith][0], l_cnt[ith][1]+l_cnt[jth][1])
         # return read count, dedup UMI count
-        return tuple([x[1] for x in l_cnt]), len(l_cnt)-len(rm_umi)
+        return read_cnt, len(l_cnt)-len(rm_umi)
     else:
-        (), len(l)
+        return read_cnt, read_cnt
 
 
-def wrt_tr_to_csv(bc_tr_count_dict, transcript_dict, csv_f, transcript_dict_ref=None, has_UMI=False,
-                  print_saturation = False):
+def wrt_tr_to_csv(bc_tr_count_dict, transcript_dict, csv_f, 
+                    transcript_dict_ref=None, has_UMI=False,print_saturation=False):
+    """
+    Write transcript count to csv file
+    """
     f = gzip.open(csv_f, "wt")
     all_tr = set()
     for bc in bc_tr_count_dict:
         all_tr.update(list(bc_tr_count_dict[bc].keys()))
     all_tr = list(all_tr)
+
+    # write header
     f.write("transcript_id,gene_id," +
             ",".join([x for x in bc_tr_count_dict])+"\n")
+    for k, v in bc_tr_count_dict.items():
+        if v is None:
+            print (k,v)
     tr_cnt = {}
     dup_count = ()
     for tr in all_tr:
-        cnt_l = [umi_dedup(bc_tr_count_dict[x][tr], has_UMI)[1]
-                 if tr in bc_tr_count_dict[x] else 0 for x in bc_tr_count_dict]
-        tr_cnt[tr] = sum(cnt_l)
-        if has_UMI:
-            dup_count += \
-                sum([umi_dedup(bc_tr_count_dict[x][tr], has_UMI)[0]
-                    if tr in bc_tr_count_dict[x] else () for x in bc_tr_count_dict], ())
+        read_counts, dup_counts = zip(
+            *[umi_dedup(bc_tr_count_dict[bc][tr], has_UMI)
+                 if tr in bc_tr_count_dict[bc] else (0,0) for bc in bc_tr_count_dict]
+                 )
+
+        tr_cnt[tr] = sum(dup_counts)
 
         if tr in transcript_dict:
             f.write(
@@ -78,13 +99,13 @@ def wrt_tr_to_csv(bc_tr_count_dict, transcript_dict, csv_f, transcript_dict_ref=
         else:
             print("cannot find transcript in transcript_dict:", tr)
             exit(1)
-        f.write(",".join([str(x) for x in cnt_l])+"\n")
+        f.write(",".join([str(x) for x in dup_counts])+"\n")
     f.close()
-    if print_saturation and has_UMI:
-        helper.green_msg(f"The isoform quantification result generated:  {csv_f}.")
-        # remove the following saturation estimation because it's done in gene quantification part
-        if sum(dup_count):
-            helper.green_msg(f"The estimated saturation is {1-len(dup_count)/sum(dup_count)}")
+
+    # print saturation if requested
+    if print_saturation and has_UMI and sum(dup_counts):
+        helper.green_msg(f"The estimated saturation is {1-len(dup_count)/sum(dup_count)}")
+
     return tr_cnt
 
 

diff --git a/man/create_config.Rd b/man/create_config.Rd
diff --git a/man/quantify_gene.Rd b/man/quantify_gene.Rd
diff --git a/vignettes/FLAMES_vignette.Rmd b/vignettes/FLAMES_vignette.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "FLAMES"
-author: "Oliver Voogd, Changqing Wang"
+author: "Oliver Voogd, Changqing Wang, Yupei You"
 package: FLAMES
 output: 
   BiocStyle::html_document:
@@ -22,13 +22,21 @@ knitr::opts_chunk$set(
 
 The FLAMES package provides a framework for performing single-cell and bulk read full-length analysis of mutations and splicing. FLAMES performs cell barcode and UMI assignment from nanopore reads as well as semi-supervised isoform detection and quantification. FLAMES is designed to be an easy and quick to use, powerful workflow for isoform detection and quantification, splicing analysis and mutation detection, and seeks to overcome the limitations of other tools, such as an inability to process single cell data, and a focus on cell barcode and UMI assignment [@flames].
 
-This R package was created to simplify installation and execution of the FLAMES python module, which supports the article *Comprehensive characterization of single cell full-length isoforms in human and mouse with long-read sequencing* by Tian et al. [-@flames].
+This R package represents an enhanced iteration of the FLAMES Python module that originally designed to support the research work presented in *Comprehensive characterization of single-cell full-length isoforms in human and mouse with long-read sequencing* by Tian et al [-@flames]. This upgraded R version not only simplifies the installation and execution processes but also incorporates additional functionality, streamlining the analysis of single-cell full-length isoforms using long-read sequencing data. 
 
 ```{r workflow, echo=FALSE, fig.align='center', fig.cap='FLAMES pipeline workflow summary'}
 knitr::include_graphics(system.file("images/FLAMESpipeline-01.png", package = "FLAMES"))
 ```
 
-Input to FLAMES are fastq files generated from the long-read platform. Using a cell barcode allow-list (e.g. barcode list obtained from short-read sequencing), the `find_barcode()` function locates the barcode sequencing in the long-reads and trims the cell barcodes and the flanking UMI sequence. The pipeline then calls `minimap_align()`, a `minimap2` wrapper, to align the demultiplex long-reads to the reference genome. Next, `find_isoform()` is called to identify novel isoforms using the alignment, creating an updated transcript assembly. The demultiplexed reads are then aligned to the transcript assembly by the function `minimap_realign()`. Finally, FLAMES calls `quantify_transcript()` to quantify the transcript counts using the (re-)alignment file.
+When processing single cell data, FLAEMS can be run on data generated from long-read platform with or without matched short-read sequencing data. If only the long reads are available, FLAMES takes as input the long reads in fastq files. The `blaze()` function is then called to demultiplex the long reads into cell-specific fastq files. If the short-read data is available, FLAMES incorporates the [BLAZE](https://github.com/shimlab/BLAZE) as `blaze()` function to identify cell barcodes solely from long reads [@blaze]. The `blaze()` function is called to locate the barcode sequencing in the long-reads, identify barcode allow-list directly from long reads, assign reads to cells (i.e., demultiplexing) and trims the cell barcodes and the flanking UMI sequence. 
+
+When matched short-read single-cell RNA sequencing data is available, FLAMES could take the cell barcode allow-list generated from short reads in addition to the long-read fastq files. The short-read allow-list will used as a reference to guide the demultiplexing of the long reads. To do this, FLAMES incorporates the [flexiplex](https://davidsongroup.github.io/flexiplex/) as `find_barcode()` function to perform demultiplexing [@flexiplex]. The `find_barcode()` function is called to extract the cell barcode and UMI sequence from the long reads, using the allow-list as a reference.
+
+After the demultiplexing, the pipeline calls `minimap_align()`, a `minimap2` wrapper, to align the demultiplex long-reads to the reference genome. `quantify_gene()` is then called to deduplicate reads from same unique molecular identifiers (UMIs) and generate gene-level UMI counts. Note that during the UMI deduplication, the pipeline only keeps the longest reads among those with the same UMI for downstream steps.
+
+Next, `find_isoform()` is called to identify novel isoforms using the alignment, creating an updated transcript assembly. In this step, the users may choose to use [bambu](https://github.com/GoekeLab/bambu), which is designed for transcript discovery and quantification using long read RNA-Seq data [@bambu]. Otherwise, users could use the build-in isoform identification methods in FLAMES. Both methods have been wrapped in the `find_isoform()` function.
+
+Afterwards, the demultiplexed reads are aligned to the transcript assembly by the function `minimap_realign()`. Finally, FLAMES calls `quantify_transcript()` to quantify the transcript counts using the (re-)alignment file.
 
 Figure \@ref(fig:workflow) summerise the steps of the pipeline described above. The pipeline functions (`sc_long_pipeline`, `bulk_long_pipeline`, `sc_long_multisample_pipline`) execute the steps sequentially and return `SingleCellExperiment` object, `SummarizedExperiment` object and a list of `SingleCellExperiment` objects respectivly. 
 
@@ -47,7 +55,7 @@ To get started, the pipeline needs access to a gene annotation file in GFF3 or G
 The single cell pipeline can demultiplex the input data before running, if required. This can be disabled by setting the `do_barcode_demultiplex` argument in the config file to `FALSE` when calling the pipeline. This example dataset has already been demultiplexed.
 
 For this example, the required files are downloaded from GitHub using [BiocFileCache](http://bioconductor.org/packages/release/bioc/html/BiocFileCache.html) [@biocfilecache].
-```{r eval=TRUE, echo=TRUE}
+```{r eval=FALSE, echo=TRUE}
 temp_path <- tempfile()
 bfc <- BiocFileCache::BiocFileCache(temp_path, ask = FALSE)
 
@@ -71,30 +79,30 @@ fastq <- bfc[[names(BiocFileCache::bfcadd(
 # setup other environment variables
 outdir <- tempfile()
 dir.create(outdir)
-config_file <- FLAMES::create_config(outdir, type = "SIRV", do_barcode_demultiplex = FALSE)
+config_file <- FLAMES::create_config(outdir, type = "SIRV", do_barcode_demultiplex = TRUE)
 ```
 
-The optional argument `config_file` can be given to both `bulk_long_pipeline()` and `sc_long_pipeline()` in order to customise the execution of the pipelines. It is expected to be a JSON file, and an example can be found by calling `FLAMES::create_config`, which returns the path to a copy of the default JSON configuration file. The values from the default configs can be altered by either editing the JSON file manually, or passing additional arguments to `FLAMES::create_config`, for example, `FLAMES::create_config(outdir, type = "sc_5end", min_sup_cnt = 10)` with create a copy of the default config for 5' end nanopore single cell reads, with the `min_sup_cnt` value (minimal number of supporting reads for a novel isoform to pass filtering) changed to 10.
+The optional argument `config_file` can be given to both `bulk_long_pipeline()` and `sc_long_pipeline()` in order to customise the execution of the pipelines. It is expected to be a JSON file, and an example can be found by calling `FLAMES::create_config`, which returns the path to a copy of the default JSON configuration file. The values from the default configs can be altered by either editing the JSON file manually, or passing additional arguments to `FLAMES::create_config`, for example, `FLAMES::create_config(outdir, type = "sc_3end", min_sup_cnt = 10)` with create a copy of the default config for 10X 3' end nanopore single cell reads, with the `min_sup_cnt` value (minimal number of supporting reads for a novel isoform to pass filtering) changed to 10.
 
 This vignette uses the default configuration file created for SIRV reads.
 
 ### FLAMES execution
 
 Once the initial variables have been setup, the pipeline can be run using:
-```{r, eval=TRUE, echo=TRUE}
+```{r, eval=FALSE, echo=TRUE}
 library(FLAMES)
 # do not run if minimap2 cannot be found
 if (is.character(locate_minimap2_dir())) { 
   sce <- sc_long_pipeline(
     annotation = annot, fastq = fastq, genome_fa = genome_fa,
-    outdir = outdir, config_file = config_file)
+    outdir = outdir, config_file = config_file, expect_cell_number = 10)
 }
 ```
 
 If, however, the input fastq files need to be demultiplexed, then the `reference_csv` argument will need to be specified - `reference_csv` is the file path to a  cell barcode allow-list, as a text file with one barcode per line. The `filtered_feature_bc_matrix/barcodes.tsv.gz` from `cellranger` outputs can be used to create such allow-list, with `zcat barcodes.tsv.gz | cut -f1 -d'-' > allow-list.csv`.
 
 The pipeline can alse be run by calling the consituent steps sequentially:
-```{r, eval=TRUE, echo=TRUE}
+```{r, eval=FALSE, echo=TRUE}
 library(FLAMES)
 minimap2_dir <- locate_minimap2_dir()
 if (is.character(minimap2_dir)) { # do not run if minimap2 cannot be found
@@ -119,7 +127,7 @@ if (is.character(minimap2_dir)) { # do not run if minimap2 cannot be found
 
 ## Visulisation
 The `combine_sce()` and `sc_annotate_plots()` can be used for visulisation with experiment designs similar to `FLT-seq`. The `combine_sce()` function combines the `SingleCellExperiment` objects of 1) the short-read gene counts from the larger subsample, 2) the short-read gene counts from the smaller subsample and 3) the transcript counts from the larger subsample into a `MultiAssayExperiment` object. The `sc_annotate_plots()` function can then use it to produce annotated plots.
-```{r, eval=TRUE, message=FALSE, warning=FALSE}
+```{r, eval=FALSE, message=FALSE, warning=FALSE}
 library(FLAMES)
 library(MultiAssayExperiment)
  # load scm_lib80, scm_lib80 and scm_lib20
@@ -173,7 +181,7 @@ A basic example of the execution of the FLAMES bulk pipeline is given below. The
 To get started, the pipeline needs access to a gene annotation file in GFF3 or GTF format, a directory containing one or more FASTQ files (which will be merged as pre-processing), a genome FASTA file, as well as the file path to minimap2, and the file path to the directory to hold output files.
 
 For this example, these files are downloaded from GitHub using [BiocFileCache](http://bioconductor.org/packages/release/bioc/html/BiocFileCache.html) [@biocfilecache].
-```{r eval=TRUE, echo=TRUE}
+```{r eval=FALSE, echo=TRUE}
 temp_path <- tempfile()
 bfc <- BiocFileCache::BiocFileCache(temp_path, ask = FALSE)
 
@@ -210,7 +218,7 @@ config_file <- FLAMES::create_config(outdir)
 
 ### FLAMES execution
 Once the environment has been setup, the pipeline can be executed by running:
-```{r eval=TRUE, echo=TRUE}
+```{r eval=FALSE, echo=TRUE}
 library(FLAMES)
 if (is.character(locate_minimap2_dir())) {
   summarizedExperiment <- bulk_long_pipeline(
@@ -271,6 +279,9 @@ The `Bash` script can then be executed inside a tmux or screen session with:
 ## FLAMES on Windows
 Due to FLAMES requiring minimap2 and pysam, FLAMES is currently unavaliable on Windows.
 
+## Citation
+Please cite flames's paper if you use flames in your research. As FLAMES used incorporates BLAZE, flexiplex and minimap2, samtools, bambu. Please make sure to cite when using these tools.
+
 # Session Info
 ``` {r echo=FALSE}
 utils::sessionInfo()

diff --git a/vignettes/ref.bib b/vignettes/ref.bib
@@ -51,3 +51,33 @@ @article {flames
 	eprint = {https://www.biorxiv.org/content/early/2020/08/10/2020.08.10.243543.full.pdf},
 	journal = {bioRxiv}
 }
+
+
+@article{blaze,
+  title={Identification of cell barcodes from long-read single-cell RNA-seq with BLAZE},
+  author={You, Yupei and Prawer, Yair DJ and Paoli-Iseppi, De and Hunt, Cameron PJ and Parish, Clare L and Shim, Heejung and Clark, Michael B and others},
+  journal={Genome Biology},
+  volume={24},
+  number={1},
+  pages={1--23},
+  year={2023},
+  publisher={BioMed Central}
+}
+
+@article{flexiplex,
+  title={Flexiplex: A versatile demultiplexer and search tool for omics data},
+  author={Davidson, Nadia M and Amin, Noorul and Min Hao, Ling and Wang, Changqing and Cheng, Oliver and Goeke, Jonathan M and Ritchie, Matthew E and Wu, Shuyi},
+  journal={bioRxiv},
+  pages={2023--08},
+  year={2023},
+  publisher={Cold Spring Harbor Laboratory}
+}
+
+@article{chen2023context,
+  title={Context-aware transcript quantification from long-read RNA-seq data with Bambu},
+  author={Chen, Ying and Sim, Andre and Wan, Yuk Kei and Yeo, Keith and Lee, Joseph Jing Xian and Ling, Min Hao and Love, Michael I and G{\"o}ke, Jonathan},
+  journal={Nature Methods},
+  pages={1--9},
+  year={2023},
+  publisher={Nature Publishing Group US New York}
+}