Skip to content

Commit

Permalink
Dev v2.1 #287 - Simplify the Gen3-DRS download option (#304)
Browse files Browse the repository at this point in the history
* Update usage.md

* Update run_on_sumner.md

* add dockerfile for csvtoolkit

* add process to convert manifest json to csv

* add process to filter manifest by file passed through --reads

* update help message

* fix bug on variable declaration

* Update nextflow.config - fix typo

* Revert "Merge branch 'master' into dev-v2.1-#287"

This reverts commit be2c2ab, reversing
changes made to 04285ef.

* Update main.nf

* patch projectDir error

* Fix publishDir path for manifest

* Fix publishDir path for manifest

* Fix typo

* Update filter_manifest.py

* Update filter_manifest.py

* fix bug on saving filenames that were not in manifest file

* Update filter_manifest.py

* remove logging of samples not found in manifest

* Update filter_manifest.py

* Makes filter_manifest txt output optional

Co-authored-by: angarb <62404570+angarb@users.noreply.github.com>
Co-authored-by: Vlad-Dembrovskyi <64809705+Vlad-Dembrovskyi@users.noreply.github.com>
Co-authored-by: Vlad-Dembrovskyi <vlad@lifebit.ai>
  • Loading branch information
4 people authored Feb 22, 2022
1 parent 8beff37 commit 7bd8f1d
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 7 deletions.
33 changes: 33 additions & 0 deletions bin/filter_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import sys
import shutil
import pandas as pd

def __main__():
    """Filter a manifest CSV down to the files listed in a reads CSV.

    Command-line arguments (read from ``sys.argv``):

    1. Path to the manifest CSV.
    2. Path to a reads CSV, or the literal string ``PASS`` to skip
       filtering and keep every manifest row. When a reads CSV is given,
       both files must contain a ``file_name`` column; manifest rows whose
       ``file_name`` does not appear in the reads CSV are dropped.

    The resulting table is written to ``filtered_manifest.csv`` in the
    current working directory.

    Raises:
        SystemExit: with a usage message when fewer than two arguments are
            supplied, or with status 1 when no manifest rows remain.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: filter_manifest.py <manifest.csv> <reads.csv|PASS>")

    manifest = sys.argv[1]
    reads = sys.argv[2]
    print("Input manifest file:", manifest)
    print("Input read file: ", reads)

    manifest_df = pd.read_csv(manifest, index_col=None, header=0, delimiter=",")

    if reads != "PASS":
        # Keep only the manifest rows whose file_name is listed in the reads file.
        reads_df = pd.read_csv(reads, index_col=None, header=0, delimiter=",")
        manifest_df = manifest_df[manifest_df["file_name"].isin(reads_df["file_name"])]

    if manifest_df.empty:
        print("Manifest file is empty after filtering.")
        # sys.exit() accepts a single argument; the previous two-argument call
        # raised TypeError instead of exiting. Exit statuses are limited to
        # 0-255, so a plain failure status is used rather than 404.
        sys.exit(1)

    print("Number of samples in filtered manifest:")
    print(len(manifest_df))

    # save final manifest file
    manifest_df.to_csv("filtered_manifest.csv", sep=",", index=False)


if __name__ == "__main__":
    __main__()
7 changes: 7 additions & 0 deletions containers/csvkit/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Image providing the csvkit toolkit (including in2csv) inside a conda environment.
FROM nfcore/base:1.9
LABEL authors="ines@lifebit.ai" \
      description="Docker image containing csvkit toolkit, including in2csv"

# Create the conda environment described in environment.yml, then remove
# conda's caches (presumably to keep the image small).
COPY environment.yml /
RUN conda env create -f /environment.yml && conda clean -a
# Put the csvkit environment's binaries first on PATH. The key=value form is
# used because the whitespace-separated ENV syntax is a discouraged legacy form.
ENV PATH=/opt/conda/envs/csvkit/bin:$PATH
9 changes: 9 additions & 0 deletions containers/csvkit/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Conda environment for the csvkit container image.
# NOTE: the accompanying Dockerfile adds /opt/conda/envs/csvkit/bin to PATH,
# so renaming this environment requires updating that path as well.
name: csvkit
channels:
- conda-forge
- bioconda
- defaults
- anaconda
dependencies:
# Versions are pinned to exact releases.
- python=3.8
- csvkit=1.0.5
61 changes: 54 additions & 7 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ def helpMessage() {
Input files:
--reads Path to reads.csv file, which specifies the sample_id and path to FASTQ files
for each read or read pair (path).
for each read or read pair (path).
When using the --download_from GTEX option the reads file must be a simple csv file listing
bam file names to be processed in the analysis. The input manifest will be downsampled
to only contain information about these files.
This file is used if starting at beginning of pipeline. It can be file paths,
s3 links or ftp link.
(default: no reads.csv)
Expand All @@ -42,6 +45,8 @@ def helpMessage() {
false should be used to run local files on the HPC (Sumner).
'TCGA' can also be used to download GDC data including HCMI data.
(default: false)
--manifest Manifest file to download data from GTEX. (string)
(default: false)
--key_file For downloading reads, use TCGA authentication token (TCGA) or
credentials.json file in case of 'GTEX'.
(default: false)
Expand Down Expand Up @@ -277,12 +282,12 @@ if (params.download_from) {
.set { accession_ids }
}
if(download_from('gtex')){
Channel
.fromPath(params.reads)
.ifEmpty { exit 1, "Cannot find CSV reads file : ${params.reads}" }
.splitCsv(skip:1)
.map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] }
.set { ch_gtex_gen3_ids }
ch_gtex_gen3_reads = params.reads ? Channel.fromPath(params.reads) : "null"

Channel
.fromPath(params.manifest)
.ifEmpty { exit 1, "Cannot find manifest file : ${params.manifest}" }
.set { ch_gtex_gen3_manifest }
}
if(download_from('ftp')){
Channel
Expand Down Expand Up @@ -464,7 +469,49 @@ if ( download_from('ftp') ) {
Download BAMs from GTEx using GEN3_DRS
---------------------------------------------------*/


if ( download_from('gtex')) {
// Convert the input manifest to CSV with csvkit's in2csv.
// Input : the manifest file from ch_gtex_gen3_manifest.
// Output: a single <name>.csv, where <name> is the manifest's file name with
//         a trailing ".json" removed, emitted on ch_gtex_gen3_manifest_csv.
process in2csv {
    label 'tiny_memory'

    input:
    file(manifest) from ch_gtex_gen3_manifest

    output:
    file("*.csv") into ch_gtex_gen3_manifest_csv

    script:
    // Shell variables are escaped (\$) so Nextflow leaves them for bash;
    // $manifest is interpolated by Nextflow. Every expansion is quoted so
    // file names containing spaces or glob characters are handled correctly.
    """
    filename=\$(basename "$manifest" .json)
    in2csv "$manifest" > "\${filename}.csv"
    """
}

// Filter the CSV manifest by running bin/filter_manifest.py, optionally
// restricting it to the files listed in the --reads file.
process filter_manifest {
label "tiny_memory"
// All declared outputs are published under <outdir>/manifest; any *.txt
// output is additionally published directly under <outdir>.
publishDir "${params.outdir}/manifest"
publishDir "${params.outdir}", pattern: "*.txt"

input:
file(manifest) from ch_gtex_gen3_manifest_csv
file(reads) from ch_gtex_gen3_reads

output:
// A *.txt output is optional: the task does not fail when none is produced.
file("*.txt") optional true
file("filtered_manifest.csv") into ch_gtex_gen3_filtered_manifest_csv

script:
// When --reads is not set, pass the literal "PASS", which filter_manifest.py
// treats as "do not filter" (it only filters when the argument != "PASS").
optional_reads = params.reads ? "$reads": "PASS"
"""
filter_manifest.py $manifest $optional_reads
"""
}

// Split the filtered manifest into one item per CSV row, skipping the header
// line. The closure destructures each row into four fields, so rows are
// expected to carry md5sum, file_name, obj_id and file_size in that order.
ch_gtex_gen3_filtered_manifest_csv
.splitCsv(skip:1)
.map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] }
.set { ch_gtex_gen3_ids }

process gen3_drs_fasp {
tag "${file_name}"
label 'low_memory'
Expand Down
7 changes: 7 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ params {
rmats_pairs = false
run_name = false
download_from = false
manifest = false
sra_config_file= "${baseDir}/assets/sra-user-settings.mkfg"
key_file = false
genome_fasta = false
Expand Down Expand Up @@ -100,6 +101,12 @@ process {
withName: 'get_accession' {
container = 'anczukowlab/download_reads:2.0'
}
withName: 'in2csv' {
container = 'quay.io/lifebitai/csvkit:1.0.5'
}
withName: 'filter_manifest' {
container = 'quay.io/lifebitai/pcgr:python-base_1.0.0'
}
withName: 'gen3_drs_fasp' {
container = 'anczukowlab/lifebit-ai-fasp:v1.1'
}
Expand Down

0 comments on commit 7bd8f1d

Please sign in to comment.