diff --git a/bin/filter_manifest.py b/bin/filter_manifest.py new file mode 100755 index 00000000..a52eb42b --- /dev/null +++ b/bin/filter_manifest.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +import sys +import shutil +import pandas as pd + +def __main__(): + +    manifest = sys.argv[1] +    reads = sys.argv[2] +    print("Input manifest file:", manifest) +    print("Input read file: ", reads) + +    manifest_df = pd.read_csv(manifest, index_col=None, header=0, delimiter=",") + +    if reads != "PASS": +        # process metadata +        reads_df = pd.read_csv(reads, index_col=None, header=0, delimiter=",") +        manifest_df = manifest_df[manifest_df['file_name'].isin(reads_df['file_name'].tolist())] + +    if manifest_df.empty: +        print("Manifest file is empty after filtering.") +        sys.exit(404) +    else: +        print("Number of samples in filtered manifest:") +        print(len(manifest_df)) + +    # save final manifest file +    manifest_df.to_csv("filtered_manifest.csv", sep=",", index=False) + +if __name__=="__main__": __main__() \ No newline at end of file diff --git a/containers/csvkit/Dockerfile b/containers/csvkit/Dockerfile new file mode 100644 index 00000000..f2d70a8b --- /dev/null +++ b/containers/csvkit/Dockerfile @@ -0,0 +1,7 @@ +FROM nfcore/base:1.9 +LABEL authors="ines@lifebit.ai" \ description="Docker image containing csvkit toolkit, including in2csv" + +COPY environment.yml / +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/csvkit/bin:$PATH \ No newline at end of file diff --git a/containers/csvkit/environment.yml b/containers/csvkit/environment.yml new file mode 100644 index 00000000..bddc658b --- /dev/null +++ b/containers/csvkit/environment.yml @@ -0,0 +1,9 @@ +name: csvkit +channels: + - conda-forge + - bioconda + - defaults + - anaconda +dependencies: + - python=3.8 + - csvkit=1.0.5 diff --git a/main.nf b/main.nf index 2dd38e28..57e84486 100755 --- a/main.nf +++ b/main.nf @@ -24,7 
+24,10 @@ def helpMessage() { Input files: --reads Path to reads.csv file, which specifies the sample_id and path to FASTQ files - for each read or read pair (path). + for each read or read pair (path). + When using the --download_from GTEX option the reads file must be a simple csv file listing + bam file names to be processed in the analysis. The input manifest will be downsampled + to only contain information about these files. This file is used if starting at beginning of pipeline. It can be file paths, s3 links or ftp link. (default: no reads.csv) @@ -42,6 +45,8 @@ def helpMessage() { false should be used to run local files on the HPC (Sumner). 'TCGA' can also be used to download GDC data including HCMI data. (default: false) + --manifest Manifest file to download data from GTEX. (string) + (default: false) --key_file For downloading reads, use TCGA authentication token (TCGA) or credentials.json file in case of 'GTEX'. (default: false) @@ -277,12 +282,12 @@ if (params.download_from) { .set { accession_ids } } if(download_from('gtex')){ - Channel - .fromPath(params.reads) - .ifEmpty { exit 1, "Cannot find CSV reads file : ${params.reads}" } - .splitCsv(skip:1) - .map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] } - .set { ch_gtex_gen3_ids } + ch_gtex_gen3_reads = params.reads ? 
Channel.fromPath(params.reads) : "null" + + Channel + .fromPath(params.manifest) + .ifEmpty { exit 1, "Cannot find manifest file : ${params.manifest}" } + .set { ch_gtex_gen3_manifest } } if(download_from('ftp')){ Channel @@ -464,7 +469,49 @@ if ( download_from('ftp') ) { Download BAMs from GTEx using GEN3_DRS ---------------------------------------------------*/ + if ( download_from('gtex')) { + process in2csv { + label 'tiny_memory' + + input: + file(manifest) from ch_gtex_gen3_manifest + + output: + file("*.csv") into ch_gtex_gen3_manifest_csv + + script: + """ + filename=\$(basename $manifest .json) + in2csv $manifest > \${filename}.csv + """ + } + + process filter_manifest { + label "tiny_memory" + publishDir "${params.outdir}/manifest" + publishDir "${params.outdir}", pattern: "*.txt" + + input: + file(manifest) from ch_gtex_gen3_manifest_csv + file(reads) from ch_gtex_gen3_reads + + output: + file("*.txt") optional true + file("filtered_manifest.csv") into ch_gtex_gen3_filtered_manifest_csv + + script: + optional_reads = params.reads ? 
"$reads": "PASS" + """ + filter_manifest.py $manifest $optional_reads + """ + } + + ch_gtex_gen3_filtered_manifest_csv + .splitCsv(skip:1) + .map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] } + .set { ch_gtex_gen3_ids } + process gen3_drs_fasp { tag "${file_name}" label 'low_memory' diff --git a/nextflow.config b/nextflow.config index 166e8bae..af748620 100755 --- a/nextflow.config +++ b/nextflow.config @@ -11,6 +11,7 @@ params { rmats_pairs = false run_name = false download_from = false + manifest = false sra_config_file= "${baseDir}/assets/sra-user-settings.mkfg" key_file = false genome_fasta = false @@ -100,6 +101,12 @@ process { withName: 'get_accession' { container = 'anczukowlab/download_reads:2.0' } + withName: 'in2csv' { + container = 'quay.io/lifebitai/csvkit:1.0.5' + } + withName: 'filter_manifest' { + container = 'quay.io/lifebitai/pcgr:python-base_1.0.0' + } withName: 'gen3_drs_fasp' { container = 'anczukowlab/lifebit-ai-fasp:v1.1' }