Dev v2.1 #287 - Simplify the Gen3-DRS download option (#304)

* Update usage.md * Update run_on_sumner.md * add dockerfile for csvtoolkit * add process to convert manifest json to csv * add process to filter manifest by file passed through --reads * update help message * fix bug on variable declaration * Update nextflow.config - fix typo * Revert "Merge branch 'master' into dev-v2.1-#287" This reverts commit be2c2ab, reversing changes made to 04285ef. * Update main.nf * patch projectDir error * Fix oublishDir path for manifest * Fix oublishDir path for manifest * Fix typo * Update filter_manifest.py * Update filter_manifest.py * fix bug on saving filenames that were not in manifest file * Update filter_manifest.py * remove logging of samples not found in manifest * Update filter_manifest.py * Makes filter_manifest txt output optional Co-authored-by: angarb <62404570+angarb@users.noreply.github.com> Co-authored-by: Vlad-Dembrovskyi <64809705+Vlad-Dembrovskyi@users.noreply.github.com> Co-authored-by: Vlad-Dembrovskyi <vlad@lifebit.ai>
TheJacksonLaboratory · Feb 22, 2022 · 7bd8f1d · 7bd8f1d
1 parent 8beff37
commit 7bd8f1d
Show file tree

Hide file tree

Showing 5 changed files with 110 additions and 7 deletions.
diff --git a/bin/filter_manifest.py b/bin/filter_manifest.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+import sys
+import shutil
+import pandas as pd
+
+def __main__():
+
+    manifest = sys.argv[1]
+    reads = sys.argv[2]
+    print("Input manifest file:", manifest)
+    print("Input read file: ", reads)
+
+    manifest_df = pd.read_csv(manifest, index_col=None, header=0, delimiter=",")
+
+    if reads != "PASS":
+        # process metadata
+        reads_df = pd.read_csv(reads, index_col=None, header=0, delimiter=",")
+        manifest_df = manifest_df[manifest_df['file_name'].isin(reads_df['file_name'].tolist())]
+
+        if manifest_df.empty:
+            print("Manifest file is empty after filtering.")
+            sys.exit(404, "Manifest file is empty after filtering.")
+        else:
+            print("Number of samples in filtered manifest:")
+            print(len(manifest_df))
+
+    # save final manifest file
+    manifest_df.to_csv("filtered_manifest.csv", sep=",", index=False) 
+
+if __name__=="__main__": __main__()
diff --git a/containers/csvkit/Dockerfile b/containers/csvkit/Dockerfile
@@ -0,0 +1,7 @@
+# Image with the csvkit toolkit (including in2csv) installed in a conda environment.
+# The base image is expected to provide conda, which the RUN step below invokes.
+FROM nfcore/base:1.9
+LABEL authors="ines@lifebit.ai" \
+      description="Docker image containing csvkit toolkit, including in2csv"
+
+# Create the environment described in environment.yml, then clean conda's caches.
+COPY environment.yml /
+RUN conda env create -f /environment.yml && conda clean -a
+# Put the csvkit environment's binaries first on PATH (key=value form of ENV).
+ENV PATH=/opt/conda/envs/csvkit/bin:$PATH
diff --git a/containers/csvkit/environment.yml b/containers/csvkit/environment.yml
@@ -0,0 +1,9 @@
+name: csvkit  # environment name; the Dockerfile adds /opt/conda/envs/csvkit/bin to PATH
+channels:  # channels listed for resolving the dependencies below
+  - conda-forge
+  - bioconda
+  - defaults
+  - anaconda
+dependencies:
+  - python=3.8
+  - csvkit=1.0.5  # the Dockerfile label describes this toolkit as including in2csv
diff --git a/main.nf b/main.nf
@@ -24,7 +24,10 @@ def helpMessage() {
 
     Input files:
       --reads                       Path to reads.csv file, which specifies the sample_id and path to FASTQ files
-                                    for each read or read pair (path).
+                                    for each read or read pair (path). 
+				    When using the --download_from GTEX option the reads file must be a simple csv file listing
+				    bam file names to be processed in the analysis. The input manifest will be downsampled
+				    to only contain information about these files.
                                     This file is used if starting at beginning of pipeline. It can be file paths,
                                     s3 links or ftp link.
                                     (default: no reads.csv)
@@ -42,6 +45,8 @@ def helpMessage() {
                                     false should be used to run local files on the HPC (Sumner).
                                     'TCGA' can also be used to download GDC data including HCMI data.
                                     (default: false)
+      --manifest                    Manifest file to download data from GTEX. (string)
+                                    (default: false)
       --key_file                    For downloading reads, use TCGA authentication token (TCGA) or
                                     credentials.json file in case of 'GTEX'.
                                     (default: false)
@@ -277,12 +282,12 @@ if (params.download_from) {
         .set { accession_ids }
   }
   if(download_from('gtex')){
-      Channel
-        .fromPath(params.reads)
-        .ifEmpty { exit 1, "Cannot find CSV reads file : ${params.reads}" }
-        .splitCsv(skip:1)
-        .map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] }
-        .set { ch_gtex_gen3_ids }
+    ch_gtex_gen3_reads = params.reads ? Channel.fromPath(params.reads) : "null"
+
+    Channel
+      .fromPath(params.manifest)
+      .ifEmpty { exit 1, "Cannot find manifest file : ${params.manifest}" }
+      .set { ch_gtex_gen3_manifest }
   }
   if(download_from('ftp')){
     Channel
@@ -464,7 +469,49 @@ if ( download_from('ftp') ) {
   Download BAMs from GTEx using GEN3_DRS 
 ---------------------------------------------------*/
 
+
 if ( download_from('gtex')) {
+  // Convert the manifest file to CSV with csvkit's in2csv.
+  // Input : manifest file from ch_gtex_gen3_manifest.
+  // Output: a CSV named after the manifest (a trailing ".json" is stripped),
+  //         emitted into ch_gtex_gen3_manifest_csv.
+  process in2csv {
+    label 'tiny_memory'
+
+    input:
+    file(manifest) from ch_gtex_gen3_manifest
+
+    output:
+    file("*.csv") into ch_gtex_gen3_manifest_csv
+
+    script:
+    // File names are quoted so that paths containing spaces reach the shell intact.
+    """
+    filename=\$(basename "$manifest" .json)
+    in2csv "$manifest" > "\${filename}.csv"
+    """
+  }
+
+  // Restrict the CSV manifest to the files listed in the --reads file by running
+  // filter_manifest.py. When --reads is not set, the literal "PASS" is passed
+  // instead of a file name, which filter_manifest.py treats as "do not filter".
+  // Output: filtered_manifest.csv, emitted into ch_gtex_gen3_filtered_manifest_csv;
+  //         any *.txt files the script produces are optional and are published too.
+  process filter_manifest {
+    label "tiny_memory"
+    publishDir "${params.outdir}/manifest"
+    publishDir "${params.outdir}", pattern: "*.txt"
+
+    input:
+    file(manifest) from ch_gtex_gen3_manifest_csv
+    file(reads) from ch_gtex_gen3_reads
+
+    output:
+    file("*.txt") optional true
+    file("filtered_manifest.csv") into ch_gtex_gen3_filtered_manifest_csv
+
+    script:
+    // Declared with `def` so the name is an explicit local of this script block.
+    def optional_reads = params.reads ? "$reads" : "PASS"
+    // Arguments are quoted so that file names containing spaces stay single arguments.
+    """
+    filter_manifest.py "$manifest" "$optional_reads"
+    """
+  }
+
+  ch_gtex_gen3_filtered_manifest_csv
+    .splitCsv(skip:1)
+    .map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] }
+    .set { ch_gtex_gen3_ids }
+
   process gen3_drs_fasp {
       tag "${file_name}"
       label 'low_memory'

diff --git a/nextflow.config b/nextflow.config
@@ -11,6 +11,7 @@ params {
     rmats_pairs    = false
     run_name	     = false
     download_from  = false
+    manifest       = false
     sra_config_file= "${baseDir}/assets/sra-user-settings.mkfg"
     key_file       = false
     genome_fasta   = false
@@ -100,6 +101,12 @@ process {
   withName: 'get_accession' {
     container = 'anczukowlab/download_reads:2.0'
   }
+  withName: 'in2csv' {
+    container = 'quay.io/lifebitai/csvkit:1.0.5'
+  }
+  withName: 'filter_manifest' {
+    container = 'quay.io/lifebitai/pcgr:python-base_1.0.0'
+  }
   withName: 'gen3_drs_fasp' {
     container = 'anczukowlab/lifebit-ai-fasp:v1.1'
   }