Commit

Merge pull request #153 from ucsd-ccbb/putative_3.9
Putative 3.9
AmandaBirmingham authored Apr 26, 2022
2 parents a6ac809 + a722671 commit b43fbc7
Showing 17 changed files with 275 additions and 175 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python-package-conda.yml
@@ -38,8 +38,8 @@ jobs:
conda install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
# The GitHub editor is 127 chars wide
flake8 . --count --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
conda install pytest
48 changes: 48 additions & 0 deletions pipeline/document_file_checksums.py
@@ -0,0 +1,48 @@
import os
from hashlib import md5
from sys import argv


def generate_input_checksums(workspace_fp, fsuffixes_to_include=None):
result = {}

for base_fp, _, fnames in os.walk(workspace_fp):
for curr_fname in fnames:
if curr_fname.startswith("."):
continue
elif fsuffixes_to_include:
matches = [x for x in fsuffixes_to_include if x in curr_fname]
if len(matches) == 0:
continue

fpath = os.path.join(base_fp, curr_fname)
with open(fpath, "rb") as f:
filebytes = f.read()
filehash = md5(filebytes)
filehash_hex = filehash.hexdigest()
if curr_fname in result:
raise ValueError(f"file name {curr_fname} found >1 time")
result[curr_fname] = filehash_hex

return result


def generate_checksums_file(args_list):
input_dir = args_list[1]
output_fp = args_list[2]
fnames_to_include = None if len(args_list) < 4 else args_list[3:]

fname_and_checksum_dict = generate_input_checksums(
input_dir, fnames_to_include)

output_lines = ["file_name,md5_hex_checksum\n"]
output_entries = [f"{k},{fname_and_checksum_dict[k]}\n" for k
in sorted(fname_and_checksum_dict)]
output_lines.extend(output_entries)

with open(output_fp, "w") as f:
f.writelines(output_lines)


if __name__ == '__main__':
generate_checksums_file(argv)
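
As a usage sketch, the new helper can be exercised directly from Python; the workspace path, output path, and suffix filters below are illustrative examples, not values mandated by the pipeline.

from document_file_checksums import generate_input_checksums, generate_checksums_file

# md5 hex digests for every non-hidden file under the workspace whose name
# contains one of the given suffixes
checksums = generate_input_checksums(
    "/tmp/example_workspace",
    fsuffixes_to_include=["-passQC.fas", "-summary.csv"])
for fname, md5_hex in sorted(checksums.items()):
    print(fname, md5_hex)

# the same work via the command-line entry point, i.e.
#   python document_file_checksums.py /tmp/example_workspace /tmp/checksums.csv "-passQC.fas" "-summary.csv"
generate_checksums_file(
    ["document_file_checksums.py", "/tmp/example_workspace",
     "/tmp/checksums.csv", "-passQC.fas", "-summary.csv"])
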
18 changes: 9 additions & 9 deletions pipeline/lineages.sh
@@ -11,8 +11,6 @@ INSPECT_METADATA_PATTERN="all_samples_search_ids_"
THREADS=8
rm -rf $WORKSPACE
mkdir -p $WORKSPACE
mkdir -p $WORKSPACE/passQC
# mkdir -p $WORKSPACE/loose_stringent
mkdir -p $WORKSPACE/stringent

# Based on inputs, decide what to download from where
@@ -83,18 +81,24 @@ runLineages () {

# add the ref and historic fas to the passing fas from all the sequencing runs
cat $WORKSPACE/*passQC.fas >> $WORKSPACE/"$PROCESSINGID"_passQC.fas
cat $WORKSPACE/"$PROCESSINGID"_passQC.fas $WORKSPACE/"$PROCESSINGID"_refs_hist.fas >> $WORKSPACE/passQC/"$PROCESSINGID"_passQC_refs_hist.fas
cat $WORKSPACE/"$PROCESSINGID"_passQC.fas $WORKSPACE/"$PROCESSINGID"_refs_hist.fas >> $WORKSPACE/"$PROCESSINGID"_passQC_refs_hist.fas

# pangolin
source $ANACONDADIR/activate pangolin
pangolin --update
pangolin -t $THREADS --analysis-mode fast --outfile $WORKSPACE/"$PROCESSINGID".lineage_report.csv $WORKSPACE/passQC/"$PROCESSINGID"_passQC_refs_hist.fas
pangolin -t $THREADS --analysis-mode fast --outfile $WORKSPACE/"$PROCESSINGID".lineage_report.csv $WORKSPACE/"$PROCESSINGID"_passQC_refs_hist.fas
echo -e "pangolin exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log

# deactivate the pangolin environment and re-activate the main pipeline environment
source $ANACONDADIR/deactivate pangolin
source $ANACONDADIR/activate covid1.2

# generate file of input checksums, for record-keeping
python $PIPELINEDIR/pipeline/document_file_checksums.py \
$WORKSPACE $WORKSPACE/"$PROCESSINGID"_input_checksums.csv \
"-passQC.fas" "-summary.csv"
echo -e "document_file_checksums.py exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log

# produce merged qc_and_lineages.csv
python $PIPELINEDIR/qc/lineages_summary.py $WORKSPACE/added_fa_names.txt $WORKSPACE "-summary.csv" $WORKSPACE/"$PROCESSINGID".lineage_report.csv $WORKSPACE/"$PROCESSINGID".qc_and_lineages.csv
echo -e "lineages_summary.py exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log
@@ -122,22 +126,18 @@ runLineages () {
$WORKSPACE/$INSPECT_METADATA_FNAME \
$WORKSPACE/"$PROCESSINGID"_passQC.fas \
$WORKSPACE/stringent/"$PROCESSINGID"_stringent_only.fas \
$WORKSPACE/passQC/"$PROCESSINGID"_passQC_refs_hist_empress_metadata.tsv \
$WORKSPACE/stringent/"$PROCESSINGID"_stringent_refs_hist_empress_metadata.tsv

echo -e "tree_prep.py exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log

# add the refs_hist.fas to the stringent_only.fas
cat $WORKSPACE/"$PROCESSINGID"_refs_hist.fas $WORKSPACE/stringent/"$PROCESSINGID"_stringent_only.fas >> $WORKSPACE/stringent/"$PROCESSINGID"_stringent_refs_hist.fas

# add the loose_only.fas to the stringent_refs_hist.fas
# cat $WORKSPACE/stringent/"$PROCESSINGID"_stringent_refs_hist.fas $WORKSPACE/loose_stringent/"$PROCESSINGID"_loose_only.fas >> $WORKSPACE/loose_stringent/"$PROCESSINGID"_loose_stringent_refs_hist.fas

aws s3 cp $WORKSPACE/"$PROCESSINGID".full_summary.csv $S3INSPECT/"$PROCESSINGID".full_summary.csv
}

{ time ( runLineages ) ; } > $WORKSPACE/"$PROCESSINGID"-lineages.log 2>&1
aws s3 cp $WORKSPACE/"$PROCESSINGID"-lineages.log $S3UPLOAD/phylogeny/$PROCESSINGID/\

grep -v "exit code: 0" $WORKSPACE/"$PROCESSINGID"-lineages.exit.log | head -n 1 >> $WORKSPACE/"$PROCESSINGID"-lineages.error.log
aws s3 cp $WORKSPACE/ $S3UPLOAD/phylogeny/$PROCESSINGID/ --recursive --quiet
aws s3 cp $WORKSPACE/ $S3UPLOAD/phylogeny/$PROCESSINGID/ --recursive --quiet --include "*" --exclude "*.fas" --exclude "*-summary.csv" --include "*_passQC_refs_hist.fas" --include "*_stringent_refs_hist.fas"
21 changes: 13 additions & 8 deletions pipeline/trim_msa.py
@@ -4,7 +4,8 @@
Trim the ends of a multiple sequence alignment (FASTA)
'''
import argparse
from sys import stdin,stdout
from sys import stdin, stdout


# parse arguments
def parseArgs():
@@ -16,22 +17,26 @@ def parseArgs():
args = parser.parse_args()
return args.input, args.output, args.start, args.end


# main code execution
infile, outfile, start, end = parseArgs()
assert start >= 0, "Length to trim from start must be non-negative integer"
assert end >= 0, "Length to trim from end must be non-negative integer"
curr_ID = None; curr_seq = None
curr_ID = None
curr_seq = None
for line in infile:
l = line.strip()
if len(l) == 0:
stripped_line = line.strip()
if len(stripped_line) == 0:
continue
if l[0] == '>':
if stripped_line[0] == '>':
if curr_ID is not None:
outfile.write("%s\n%s\n" % (curr_ID, curr_seq[start:-end]))
curr_ID = l; curr_seq = ''
curr_ID = stripped_line
curr_seq = ''
else:
assert curr_seq is not None, "Invalid input FASTA"
curr_seq += l
curr_seq += stripped_line
if curr_ID is not None:
outfile.write("%s\n%s\n" % (curr_ID, curr_seq[start:-end]))
infile.close(); outfile.close()
infile.close()
outfile.close()
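
For reference, the trimming semantics themselves are unchanged: each record is written as curr_seq[start:-end], so with made-up values

seq = "ACGTACGTAC"
start, end = 2, 3
print(seq[start:-end])  # "GTACG": 2 bases trimmed from the start, 3 from the end
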
54 changes: 12 additions & 42 deletions qc/custom_gen_stats_multiqc.py
@@ -115,27 +115,7 @@ def gather_p25_ins_sizes(qmap_file_list_fp):
return data_dict


def insert_pct_gte_q30_in_sample_dict(data_dict, name, pctQ30):
temp_sample_dict = data_dict.get(name, dict())
if pctQ30 != NA_VAL:
pctQ30 = round(pctQ30, 3)
temp_sample_dict[PCT_30_KEY] = pctQ30
return temp_sample_dict


def insert_uncapped_reads_in_sample_dict(data_dict, name, uncappedReads):
temp_sample_dict = data_dict.get(name, dict())
temp_sample_dict[UNCAPPED_READS_KEY] = uncappedReads
return temp_sample_dict


def insert_sub_map_pct_in_sample_dict(data_dict, name, pctSubMap):
temp_sample_dict = data_dict.get(name, dict())
temp_sample_dict[SUBSAMPLED_MAPPED_PCT_ALIGNED_KEY] = pctSubMap
return temp_sample_dict


def generate_q30_based_values(input_dict, R1, S1, R2=None, S2=None):
def insert_q30_based_values(input_dict, R1, S1, R2=None, S2=None):
name = get_sequenced_pool_component_id(R1)

if R2 is not None:
@@ -156,12 +136,13 @@ def generate_q30_based_values(input_dict, R1, S1, R2=None, S2=None):
print(f"Warning: Unable to calculate values from q30 file due "
f"to division by zero; reporting as {NA_VAL}")

temp_pctQ30_dict = insert_pct_gte_q30_in_sample_dict(
input_dict, name, pctQ30)
temp_uncapped_reads_dict = insert_uncapped_reads_in_sample_dict(
input_dict, name, uncapped_reads)
temp_sample_dict = input_dict.get(name, dict())

return name, temp_pctQ30_dict, temp_uncapped_reads_dict
if pctQ30 != NA_VAL:
pctQ30 = round(pctQ30, 3)
temp_sample_dict[PCT_30_KEY] = pctQ30
temp_sample_dict[UNCAPPED_READS_KEY] = uncapped_reads
input_dict[name] = temp_sample_dict


def gather_pct_gte_q30(q30_file_list_fp, se_or_pe, data_dict):
@@ -172,23 +153,13 @@ def gather_pct_gte_q30(q30_file_list_fp, se_or_pe, data_dict):
if se_or_pe == SE_VALUE:
for R1 in q30s_paths:
S1 = parseSeqQual(R1)

name, temp_pctQ30_dict, temp_uncapped_reads_dict = \
generate_q30_based_values(data_dict, R1, S1)

data_dict[name] = temp_pctQ30_dict
data_dict[name] = temp_uncapped_reads_dict
insert_q30_based_values(data_dict, R1, S1)
elif se_or_pe == PE_VALUE:
# Get Q30s for both R1 and R2
for R1, R2 in pairwise(q30s_paths):
S1 = parseSeqQual(R1)
S2 = parseSeqQual(R2)

name, temp_pctQ30_dict, temp_uncapped_reads_dict = \
generate_q30_based_values(data_dict, R1, S1, R2, S2)

data_dict[name] = temp_pctQ30_dict
data_dict[name] = temp_uncapped_reads_dict
insert_q30_based_values(data_dict, R1, S1, R2, S2)
else:
print(f"Warning: Unable to run generate percent greater than q30 and "
f"number of uncapped reads for input type '{se_or_pe}'")
@@ -216,10 +187,9 @@ def gather_sub_map_pct(sub_map_stats_file_list_fp, data_dict):
reads = parseSubMapStats(sample) # format: [Mapped, Unmapped]
pctAligned = _calc_pct_aligned(reads)

temp_pctAligned_dict = insert_sub_map_pct_in_sample_dict(
data_dict, name, pctAligned)

data_dict[name] = temp_pctAligned_dict
temp_sample_dict = data_dict.get(name, dict())
temp_sample_dict[SUBSAMPLED_MAPPED_PCT_ALIGNED_KEY] = pctAligned
data_dict[name] = temp_sample_dict

return data_dict
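
The removed insert_*_in_sample_dict helpers are folded into in-place updates of the shared per-sample dict, so callers no longer capture and re-assign partial dictionaries. A minimal sketch of the pattern; the sample name and metric keys below are illustrative, not the module's actual key constants.

data_dict = {}

def insert_metrics(data_dict, name, pct_q30, uncapped_reads):
    # fetch (or create) the per-sample dict, update it, and write it back once
    sample_dict = data_dict.get(name, dict())
    sample_dict["pct_gte_q30"] = round(pct_q30, 3)
    sample_dict["uncapped_reads"] = uncapped_reads
    data_dict[name] = sample_dict

insert_metrics(data_dict, "sample_A", 93.4567, 120000)
print(data_dict)  # {'sample_A': {'pct_gte_q30': 93.457, 'uncapped_reads': 120000}}
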

9 changes: 7 additions & 2 deletions qc/metadata_generation.py
@@ -12,6 +12,7 @@
SUBMIT_TO_GISAID = "submit_to_gisaid"
INDEX_COL_NAME = "Unnamed: 0"
SUBJ_AGE = "subject_age"
METADATA_CLEARED = "metadata_cleared"

BJORN_COL_NAMES = ["Sample ID", "SEARCH SampleID", "Ready for release?",
"New sequences ready for release", "Released?",
@@ -29,7 +30,7 @@
"Sequenced Pool Component Id", "Variant File S3 URL",
"Consensus File S3 URL", "BAM File S3 URL",
"Source", "Sequencing Run", "Overall Fail",
"Inspect Submit-to-GISAID"]
"Inspect Submit-to-GISAID", METADATA_CLEARED]


def filter_metadata_for_bjorn(full_df):
@@ -72,11 +73,14 @@ def generate_bjorn_df(filtered_df):
# The metadata reads in as strings (hence 'True') while the qc and lineage
# data reads in parsed (hence False, not 'False')
inspect_approval = filtered_df[SUBMIT_TO_GISAID] == 'True' # noqa 712
# NB: if sample's metadata_cleared is NA, sample IS allowed for release.
# This is because we have no knowledge about whether that metadata is bad.
metadata_bad = filtered_df[METADATA_CLEARED] == 'False' # noqa 712
# NB: this check is false for overall_fail values of None, as it should be:
# don't release items with overall_fail == None since they are unknown
no_overall_fail = filtered_df[OVERALL_FAIL] == False # noqa 712

release_mask = inspect_approval & no_overall_fail
release_mask = inspect_approval & no_overall_fail & ~metadata_bad
output_df.loc[:, "ready_for_release"] = "No"
output_df.loc[release_mask, "ready_for_release"] = "Yes"
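
The new metadata_bad term relies on standard pandas comparison semantics: a NaN metadata_cleared value compared to the string 'False' yields False, so ~metadata_bad remains True for such rows and they stay eligible for release, matching the NB above. A self-contained sketch with illustrative values:

import numpy as np
import pandas as pd

df = pd.DataFrame({"metadata_cleared": ["True", "False", np.nan]})
metadata_bad = df["metadata_cleared"] == "False"
print((~metadata_bad).tolist())  # [True, False, True]; the NaN row is not blocked
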

@@ -136,6 +140,7 @@ def generate_bjorn_df(filtered_df):
output_df.loc[:, SEQ_RUN] = filtered_df[SEQ_RUN]
output_df.loc[:, OVERALL_FAIL] = filtered_df[OVERALL_FAIL]
output_df.loc[:, SUBMIT_TO_GISAID] = filtered_df[SUBMIT_TO_GISAID]
output_df.loc[:, METADATA_CLEARED] = filtered_df[METADATA_CLEARED]

return output_df

8 changes: 7 additions & 1 deletion qc/qc_summary.sh
@@ -94,6 +94,12 @@ runQC () {
echo -e "subset_csv.py exit code: $?" >> $WORKSPACE/"$SEQ_RUN"-qc.exit.log
cat $PASSING_CONS_FNAMES > $WORKSPACE/"$SEQ_RUN"-passQC.fas

# generate file of summary and passQC fas checksums, for record-keeping
python $PIPELINEDIR/pipeline/document_file_checksums.py \
$WORKSPACE $WORKSPACE/"$SEQ_RUN"_artifact_checksums.csv \
"$SEQ_RUN-summary.csv" "$SEQ_RUN-passQC.fas"
echo -e "document_file_checksums.py exit code: $?" >> $WORKSPACE/"$SEQ_RUN"-qc.exit.log

# Exit codes
echo "Gathering per-sample exit codes."
cat $WORKSPACE/*/*error.log > $WORKSPACE/"$SEQ_RUN".error.log
@@ -109,7 +115,7 @@ runQC () {
# cumulative data folder
S3CUMULATIVE=$S3DOWNLOAD
aws s3 cp $WORKSPACE/"$SEQ_RUN"-passQC.fas $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
aws s3 cp $WORKSPACE/"$SEQ_RUN".fas $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
#aws s3 cp $WORKSPACE/"$SEQ_RUN".fas $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
aws s3 cp $WORKSPACE/"$SEQ_RUN"-summary.csv $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
}

10 changes: 6 additions & 4 deletions qc/tree_prep.py
@@ -56,8 +56,9 @@ def prep_files_for_tree_building(arg_list):
metadata_fp = arg_list[2] # May be None
full_fasta_fp = arg_list[3]
out_stringent_fasta_fp = arg_list[4]
out_empress_fp = arg_list[5]
out_stringent_empress_fp = arg_list[6]
out_stringent_empress_fp = arg_list[5]
# optional 7th argument for passQC (not just stringent) empress metadata
out_empress_fp = None if len(arg_list) <= 6 else arg_list[6]

qc_and_lineage_df = pd.read_csv(qc_and_lineage_fp) # , dtype=str)

@@ -75,8 +76,9 @@ def prep_files_for_tree_building(arg_list):
raw_empress_df = qc_and_lineage_df.copy()

base_empress_df = generate_base_empress_df(raw_empress_df)
# NB that empress metadata files must be tsv
base_empress_df.to_csv(out_empress_fp, sep='\t', index=False)
if out_empress_fp:
# NB that empress metadata files must be tsv
base_empress_df.to_csv(out_empress_fp, sep='\t', index=False)

stringent_mask = \
base_empress_df[STRINGENT_TEST_COL] == STRINGENT_INCLUDE_VAL
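
With the reordered arguments, the passQC empress metadata path becomes an optional trailing argument, which is why the lineages.sh call above now passes only the stringent path. A sketch of the two invocation shapes; the file names are illustrative.

# seven-element arg_list: the optional passQC empress metadata TSV is also written
prep_files_for_tree_building([
    "tree_prep.py", "qc_and_lineages.csv", "metadata.csv",
    "passQC.fas", "stringent_only.fas",
    "stringent_refs_hist_empress_metadata.tsv",
    "passQC_refs_hist_empress_metadata.tsv"])

# six-element arg_list: out_empress_fp stays None and no passQC metadata TSV is written
prep_files_for_tree_building([
    "tree_prep.py", "qc_and_lineages.csv", "metadata.csv",
    "passQC.fas", "stringent_only.fas",
    "stringent_refs_hist_empress_metadata.tsv"])
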