Skip to content

Commit

Permalink
add process to filter manifest by file passed through --reads
Browse files Browse the repository at this point in the history
  • Loading branch information
imendes93 committed Dec 23, 2021
1 parent 5c36a60 commit 1ad094b
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 12 deletions.
31 changes: 31 additions & 0 deletions bin/filter_manifest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import sys
import shutil
import pandas as pd

def __main__() -> None:
    """Filter a manifest CSV down to the files listed in a reads CSV.

    Command-line arguments (read from ``sys.argv``):
        1. Path to the manifest CSV. It must contain a ``file_name`` column.
        2. Path to a reads CSV with a ``file_name`` column, or the literal
           string ``"PASS"`` to keep the manifest unfiltered.

    Files written to the current working directory:
        filtered_manifest.csv: the manifest rows that will be processed.
        not_found_GTEX_samples.txt: rows of the reads CSV whose
            ``file_name`` is absent from the manifest. Only written when
            at least one requested name is missing.

    Raises:
        SystemExit: with an explanatory message (exit status 1) when a
            reads file is given and none of its names match the manifest.
    """
    manifest = sys.argv[1]
    reads = sys.argv[2]
    print("Input manifest file:", manifest)
    print("Input read file: ", reads)

    manifest_df = pd.read_csv(manifest, index_col=None, header=0, delimiter=",")

    if reads != "PASS":
        # process metadata
        reads_df = pd.read_csv(reads, index_col=None, header=0, delimiter=",")

        # Requested rows with no counterpart in the manifest. Computed once,
        # against the flat list of manifest names (not a list wrapped in
        # another list, which never matches individual names).
        missing_df = reads_df[~reads_df['file_name'].isin(manifest_df['file_name'].tolist())]

        manifest_df = manifest_df[manifest_df['file_name'].isin(reads_df['file_name'].tolist())]

        if manifest_df.empty:
            # sys.exit accepts a single argument; a string is printed to
            # stderr and the interpreter exits with status 1.
            sys.exit("Manifest file is empty after filtering.")

        if not missing_df.empty:
            print("The following file_name IDs where not found in manifest:")
            print(missing_df)
            missing_df.to_csv("not_found_GTEX_samples.txt", index=False)

    # save final manifest file
    manifest_df.to_csv("filtered_manifest.csv", sep=",", index=False)

# Run the filter only when this file is executed as a script.
if __name__ == "__main__":
    __main__()
41 changes: 29 additions & 12 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -277,18 +277,12 @@ if (params.download_from) {
.set { accession_ids }
}
if(download_from('gen3-drs')){
// TODO - Reads parameter optional. If not provided use all data in manifest
Channel
.fromPath(params.reads)
.ifEmpty { exit 1, "Cannot find CSV reads file : ${params.reads}" }
.splitCsv(skip:1)
.map { file_name -> [file_name] }
.set { ch_gtex_gen3_reads }
ch_gtex_gen3_reads = params.reads ? Channel.fromPath(params.reads) : "null"

Channel
.fromPath(params.manifest)
.ifEmpty { exit 1, "Cannot find manifest file : ${params.manifest}" }
.set { ch_gtex_gen3_manifest }
Channel
.fromPath(params.manifest)
.ifEmpty { exit 1, "Cannot find manifest file : ${params.manifest}" }
.set { ch_gtex_gen3_manifest }
}
if(download_from('ftp')){
Channel
Expand Down Expand Up @@ -368,6 +362,7 @@ if ( download_from('gen3-drs')) {
.fromPath(params.genome_fasta)
.ifEmpty { exit 1, "${params.genome_fasta} is not present" }
.set {ch_genome_fasta}
filter_manifest_py = Channel.fromPath("${projectDir}/bin/filter_manifest.py", type: 'file', followLinks: false)
}

if ( download_from('sra')) {
Expand Down Expand Up @@ -487,7 +482,29 @@ if ( download_from('gen3-drs')) {
in2csv $manifest > \${filename}.csv
"""
}
ch_gtex_gen3_manifest_csv.

// Restricts the converted manifest CSV to the files named in the optional
// --reads CSV by running bin/filter_manifest.py, and emits the result as
// filtered_manifest.csv.
process filter_manifest {
    label "tiny_memory"
    // NOTE(review): `file_name` is not an input of this process, so this path
    // looks copied from a per-file process -- confirm it resolves at runtime.
    publishDir "${params.outdir}/process-logs/${task.process}/${file(file_name).baseName}", pattern: "command-logs-*", mode: 'copy'
    publishDir "${params.outdir}", pattern: "*.txt"

    input:
    file(manifest) from ch_gtex_gen3_manifest_csv
    file(reads) from ch_gtex_gen3_reads
    each file("filter_manifest.py") from filter_manifest_py

    output:
    // The script only writes a .txt report when requested file names are
    // missing from the manifest, so the output must not be mandatory.
    file("*.txt") optional true
    file("filtered_manifest.csv") into ch_gtex_gen3_filtered_manifest_csv

    script:
    // "PASS" tells the script to keep the manifest unfiltered.
    optional_reads = params.reads ? "$reads": "PASS"
    """
    filter_manifest.py $manifest $optional_reads
    """
}

ch_gtex_gen3_filtered_manifest_csv.
.splitCsv(skip:1)
.map { md5sum, file_name, obj_id, file_size -> [md5sum, file_name, obj_id, file_size] }
.set { ch_gtex_gen3_ids }
Expand Down
3 changes: 3 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ process {
withName: 'in2csv' {
container = 'quay.io/lifebitai/csvkit:1.0.5'
}
withName: 'filter_manifest' {
container = 'quay.io/lifebitai/pcgr:python-base_1.0.0'
}
withName: 'gen3_drs_fasp' {
container = 'anczukowlab/lifebit-ai-fasp:v1.1'
}
Expand Down

0 comments on commit 1ad094b

Please sign in to comment.