diff --git a/bin/filter_manifest.py b/bin/filter_manifest.py
new file mode 100644
index 00000000..555a607a
--- /dev/null
+++ b/bin/filter_manifest.py
@@ -0,0 +1,55 @@
+import os
+import sys
+import shutil
+import pandas as pd
+
+
+def __main__():
+    """Filter a manifest CSV down to the files listed in a reads CSV.
+
+    Command-line arguments:
+        sys.argv[1]: Path to the manifest CSV.
+        sys.argv[2]: Path to a reads CSV, or the literal string "PASS" to
+            skip filtering and keep every manifest row.
+
+    Both CSVs are matched on their 'file_name' column.
+
+    Files written to the current working directory:
+        filtered_manifest.csv: Manifest rows that survive filtering.
+        not_found_GTEX_samples.txt: Reads rows whose file_name is missing
+            from the manifest; written only when at least one is missing.
+
+    Exits with a non-zero status and a message on stderr when filtering
+    leaves the manifest empty.
+    """
+    manifest = sys.argv[1]
+    reads = sys.argv[2]
+    print("Input manifest file:", manifest)
+    print("Input read file: ", reads)
+
+    manifest_df = pd.read_csv(manifest, index_col=None, header=0, delimiter=",")
+
+    if reads != "PASS":
+        # process metadata
+        reads_df = pd.read_csv(reads, index_col=None, header=0, delimiter=",")
+        manifest_df = manifest_df[manifest_df['file_name'].isin(reads_df['file_name'])]
+
+        if manifest_df.empty:
+            # sys.exit() takes a single argument; a string is printed to
+            # stderr and the exit status is 1.
+            sys.exit("Manifest file is empty after filtering.")
+
+        # Match against the flat column of names (not a list nested inside
+        # another list) and compute the missing rows once.
+        not_found_df = reads_df[~reads_df['file_name'].isin(manifest_df['file_name'])]
+        if not not_found_df.empty:
+            print("The following file_name IDs where not found in manifest:")
+            print(not_found_df)
+            not_found_df.to_csv("not_found_GTEX_samples.txt", index=False)
+
+    # save final manifest file
+    manifest_df.to_csv("filtered_manifest.csv", sep=",", index=False)
+
+
+if __name__ == "__main__":
+    __main__()
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 8a381059..2574c807 100755
--- a/main.nf
+++ b/main.nf
@@ -277,18 +277,12 @@ if (params.download_from) {
       .set { accession_ids }
   }
   if(download_from('gen3-drs')){
-      // TODO - Reads parameter optional. If not provided use all data in manifest
-      Channel
-        .fromPath(params.reads)
-        .ifEmpty { exit 1, "Cannot find CSV reads file : ${params.reads}" }
-        .splitCsv(skip:1)
-        .map { file_name -> [file_name] }
-        .set { ch_gtex_gen3_reads }
+    ch_gtex_gen3_reads = params.reads ? Channel.fromPath(params.reads) : "null"
 
-      Channel
-        .fromPath(params.manifest)
-        .ifEmpty { exit 1, "Cannot find manifest file : ${params.manifest}" }
-        .set { ch_gtex_gen3_manifest }
+    Channel
+      .fromPath(params.manifest)
+      .ifEmpty { exit 1, "Cannot find manifest file : ${params.manifest}" }
+      .set { ch_gtex_gen3_manifest }
   }
   if(download_from('ftp')){
     Channel
@@ -368,6 +362,7 @@ if ( download_from('gen3-drs')) {
     .fromPath(params.genome_fasta)
     .ifEmpty { exit 1, "${params.genome_fasta} is not present" }
     .set {ch_genome_fasta}
+  filter_manifest_py = Channel.fromPath("${projectDir}/bin/filter_manifest.py", type: 'file', followLinks: false)
 }
 
 if ( download_from('sra')) {
@@ -487,7 +482,32 @@ if ( download_from('gen3-drs')) {
     in2csv $manifest > \${filename}.csv
     """
   }
-  ch_gtex_gen3_manifest_csv.
+
+  process filter_manifest {
+    label "tiny_memory"
+    // Publish under the manifest's base name; `file_name` is not an input of this process.
+    publishDir "${params.outdir}/process-logs/${task.process}/${manifest.baseName}", pattern: "command-logs-*", mode: 'copy'
+    publishDir "${params.outdir}", pattern: "*.txt"
+
+    input:
+    file(manifest) from ch_gtex_gen3_manifest_csv
+    file(reads) from ch_gtex_gen3_reads
+    each file("filter_manifest.py") from filter_manifest_py
+
+    output:
+    // The not-found report is only written when some reads are missing from the manifest.
+    file("*.txt") optional true
+    file("filtered_manifest.csv") into ch_gtex_gen3_filtered_manifest_csv
+
+    script:
+    // Run through the interpreter: the staged script has no shebang and is not executable.
+    optional_reads = params.reads ? "$reads": "PASS"
+    """
+    python filter_manifest.py $manifest $optional_reads
+    """
+  }
+
+  ch_gtex_gen3_filtered_manifest_csv.
     .splitCsv(skip:1)
     .map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] }
     .set { ch_gtex_gen3_ids }
diff --git a/nextflow.config b/nextflow.config
index cc35b99d..f51554ce 100755
--- a/nextflow.config
+++ b/nextflow.config
@@ -103,6 +103,9 @@ process {
   withName: 'in2csv' {
     container = 'quay.io/lifebitai/csvkit:1.0.5'
   }
+  withName: 'filter_manifest' {
+    container = 'quay.io/lifebitai/pcgr:python-base_1.0.0'
+  }
   withName: 'gen3_drs_fasp' {
     container = 'anczukowlab/lifebit-ai-fasp:v1.1'
   }