Commit

Merge pull request #153 from ucsd-ccbb/putative_3.9
Putative 3.9
AmandaBirmingham authored Apr 26, 2022
2 parents a6ac809 + a722671 commit b43fbc7
Showing 17 changed files with 275 additions and 175 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python-package-conda.yml
@@ -38,8 +38,8 @@ jobs:
conda install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
# The GitHub editor is 127 chars wide
flake8 . --count --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
conda install pytest
48 changes: 48 additions & 0 deletions pipeline/document_file_checksums.py
@@ -0,0 +1,48 @@
import os
from hashlib import md5
from sys import argv


def generate_input_checksums(workspace_fp, fsuffixes_to_include=None):
result = {}

for base_fp, _, fnames in os.walk(workspace_fp):
for curr_fname in fnames:
if curr_fname.startswith("."):
continue
elif fsuffixes_to_include:
matches = [x for x in fsuffixes_to_include if x in curr_fname]
if len(matches) == 0:
continue

fpath = os.path.join(base_fp, curr_fname)
with open(fpath, "rb") as f:
filebytes = f.read()
filehash = md5(filebytes)
filehash_hex = filehash.hexdigest()
if curr_fname in result:
raise ValueError(f"file name {curr_fname} found >1 time")
result[curr_fname] = filehash_hex

return result


def generate_checksums_file(args_list):
input_dir = args_list[1]
output_fp = args_list[2]
fnames_to_include = None if len(args_list) < 4 else args_list[3:]

fname_and_checksum_dict = generate_input_checksums(
input_dir, fnames_to_include)

output_lines = ["file_name,md5_hex_checksum\n"]
output_entries = [f"{k},{fname_and_checksum_dict[k]}\n" for k
in sorted(fname_and_checksum_dict)]
output_lines.extend(output_entries)

with open(output_fp, "w") as f:
f.writelines(output_lines)


if __name__ == '__main__':
generate_checksums_file(argv)
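
As a usage sketch, the new helper can be exercised directly from Python; the workspace path, output path, and suffix filters below are illustrative examples, not values mandated by the pipeline.

from document_file_checksums import generate_input_checksums, generate_checksums_file

# md5 hex digests for every non-hidden file under the workspace whose name
# contains one of the given suffixes
checksums = generate_input_checksums(
    "/tmp/example_workspace",
    fsuffixes_to_include=["-passQC.fas", "-summary.csv"])
for fname, md5_hex in sorted(checksums.items()):
    print(fname, md5_hex)

# the same work via the command-line entry point, i.e.
#   python document_file_checksums.py /tmp/example_workspace /tmp/checksums.csv "-passQC.fas" "-summary.csv"
generate_checksums_file(
    ["document_file_checksums.py", "/tmp/example_workspace",
     "/tmp/checksums.csv", "-passQC.fas", "-summary.csv"])
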
18 changes: 9 additions & 9 deletions pipeline/lineages.sh
@@ -11,8 +11,6 @@ INSPECT_METADATA_PATTERN="all_samples_search_ids_"
THREADS=8
rm -rf $WORKSPACE
mkdir -p $WORKSPACE
mkdir -p $WORKSPACE/passQC
# mkdir -p $WORKSPACE/loose_stringent
mkdir -p $WORKSPACE/stringent

# Based on inputs, decide what to download from where
@@ -83,18 +81,24 @@ runLineages () {

# add the ref and historic fas to the passing fas from all the sequencing runs
cat $WORKSPACE/*passQC.fas >> $WORKSPACE/"$PROCESSINGID"_passQC.fas
cat $WORKSPACE/"$PROCESSINGID"_passQC.fas $WORKSPACE/"$PROCESSINGID"_refs_hist.fas >> $WORKSPACE/passQC/"$PROCESSINGID"_passQC_refs_hist.fas
cat $WORKSPACE/"$PROCESSINGID"_passQC.fas $WORKSPACE/"$PROCESSINGID"_refs_hist.fas >> $WORKSPACE/"$PROCESSINGID"_passQC_refs_hist.fas

# pangolin
source $ANACONDADIR/activate pangolin
pangolin --update
pangolin -t $THREADS --analysis-mode fast --outfile $WORKSPACE/"$PROCESSINGID".lineage_report.csv $WORKSPACE/passQC/"$PROCESSINGID"_passQC_refs_hist.fas
pangolin -t $THREADS --analysis-mode fast --outfile $WORKSPACE/"$PROCESSINGID".lineage_report.csv $WORKSPACE/"$PROCESSINGID"_passQC_refs_hist.fas
echo -e "pangolin exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log

# deactivate the pangolin environment and re-activate the main pipeline environment
source $ANACONDADIR/deactivate pangolin
source $ANACONDADIR/activate covid1.2

# generate file of input checksums, for record-keeping
python $PIPELINEDIR/pipeline/document_file_checksums.py \
$WORKSPACE $WORKSPACE/"$PROCESSINGID"_input_checksums.csv \
"-passQC.fas" "-summary.csv"
echo -e "document_file_checksums.py exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log

# produce merged qc_and_lineages.csv
python $PIPELINEDIR/qc/lineages_summary.py $WORKSPACE/added_fa_names.txt $WORKSPACE "-summary.csv" $WORKSPACE/"$PROCESSINGID".lineage_report.csv $WORKSPACE/"$PROCESSINGID".qc_and_lineages.csv
echo -e "lineages_summary.py exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log
@@ -122,22 +126,18 @@ runLineages () {
$WORKSPACE/$INSPECT_METADATA_FNAME \
$WORKSPACE/"$PROCESSINGID"_passQC.fas \
$WORKSPACE/stringent/"$PROCESSINGID"_stringent_only.fas \
$WORKSPACE/passQC/"$PROCESSINGID"_passQC_refs_hist_empress_metadata.tsv \
$WORKSPACE/stringent/"$PROCESSINGID"_stringent_refs_hist_empress_metadata.tsv

echo -e "tree_prep.py exit code: $?" >> $WORKSPACE/"$PROCESSINGID"-lineages.exit.log

# add the refs_hist.fas to the stringent_only.fas
cat $WORKSPACE/"$PROCESSINGID"_refs_hist.fas $WORKSPACE/stringent/"$PROCESSINGID"_stringent_only.fas >> $WORKSPACE/stringent/"$PROCESSINGID"_stringent_refs_hist.fas

# add the loose_only.fas to the stringent_refs_hist.fas
# cat $WORKSPACE/stringent/"$PROCESSINGID"_stringent_refs_hist.fas $WORKSPACE/loose_stringent/"$PROCESSINGID"_loose_only.fas >> $WORKSPACE/loose_stringent/"$PROCESSINGID"_loose_stringent_refs_hist.fas

aws s3 cp $WORKSPACE/"$PROCESSINGID".full_summary.csv $S3INSPECT/"$PROCESSINGID".full_summary.csv
}

{ time ( runLineages ) ; } > $WORKSPACE/"$PROCESSINGID"-lineages.log 2>&1
aws s3 cp $WORKSPACE/"$PROCESSINGID"-lineages.log $S3UPLOAD/phylogeny/$PROCESSINGID/\

grep -v "exit code: 0" $WORKSPACE/"$PROCESSINGID"-lineages.exit.log | head -n 1 >> $WORKSPACE/"$PROCESSINGID"-lineages.error.log
aws s3 cp $WORKSPACE/ $S3UPLOAD/phylogeny/$PROCESSINGID/ --recursive --quiet
aws s3 cp $WORKSPACE/ $S3UPLOAD/phylogeny/$PROCESSINGID/ --recursive --quiet --include "*" --exclude "*.fas" --exclude "*-summary.csv" --include "*_passQC_refs_hist.fas" --include "*_stringent_refs_hist.fas"
21 changes: 13 additions & 8 deletions pipeline/trim_msa.py
@@ -4,7 +4,8 @@
Trim the ends of a multiple sequence alignment (FASTA)
'''
import argparse
from sys import stdin,stdout
from sys import stdin, stdout


# parse arguments
def parseArgs():
@@ -16,22 +17,26 @@ def parseArgs():
args = parser.parse_args()
return args.input, args.output, args.start, args.end


# main code execution
infile, outfile, start, end = parseArgs()
assert start >= 0, "Length to trim from start must be non-negative integer"
assert end >= 0, "Length to trim from end must be non-negative integer"
curr_ID = None; curr_seq = None
curr_ID = None
curr_seq = None
for line in infile:
l = line.strip()
if len(l) == 0:
stripped_line = line.strip()
if len(stripped_line) == 0:
continue
if l[0] == '>':
if stripped_line[0] == '>':
if curr_ID is not None:
outfile.write("%s\n%s\n" % (curr_ID, curr_seq[start:-end]))
curr_ID = l; curr_seq = ''
curr_ID = stripped_line
curr_seq = ''
else:
assert curr_seq is not None, "Invalid input FASTA"
curr_seq += l
curr_seq += stripped_line
if curr_ID is not None:
outfile.write("%s\n%s\n" % (curr_ID, curr_seq[start:-end]))
infile.close(); outfile.close()
infile.close()
outfile.close()
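
For reference, the trimming semantics themselves are unchanged: each record is written as curr_seq[start:-end], so with made-up values

seq = "ACGTACGTAC"
start, end = 2, 3
print(seq[start:-end])  # "GTACG": 2 bases trimmed from the start, 3 from the end
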
54 changes: 12 additions & 42 deletions qc/custom_gen_stats_multiqc.py
@@ -115,27 +115,7 @@ def gather_p25_ins_sizes(qmap_file_list_fp):
return data_dict


def insert_pct_gte_q30_in_sample_dict(data_dict, name, pctQ30):
temp_sample_dict = data_dict.get(name, dict())
if pctQ30 != NA_VAL:
pctQ30 = round(pctQ30, 3)
temp_sample_dict[PCT_30_KEY] = pctQ30
return temp_sample_dict


def insert_uncapped_reads_in_sample_dict(data_dict, name, uncappedReads):
temp_sample_dict = data_dict.get(name, dict())
temp_sample_dict[UNCAPPED_READS_KEY] = uncappedReads
return temp_sample_dict


def insert_sub_map_pct_in_sample_dict(data_dict, name, pctSubMap):
temp_sample_dict = data_dict.get(name, dict())
temp_sample_dict[SUBSAMPLED_MAPPED_PCT_ALIGNED_KEY] = pctSubMap
return temp_sample_dict


def generate_q30_based_values(input_dict, R1, S1, R2=None, S2=None):
def insert_q30_based_values(input_dict, R1, S1, R2=None, S2=None):
name = get_sequenced_pool_component_id(R1)

if R2 is not None:
@@ -156,12 +136,13 @@ def generate_q30_based_values(input_dict, R1, S1, R2=None, S2=None):
print(f"Warning: Unable to calculate values from q30 file due "
f"to division by zero; reporting as {NA_VAL}")

temp_pctQ30_dict = insert_pct_gte_q30_in_sample_dict(
input_dict, name, pctQ30)
temp_uncapped_reads_dict = insert_uncapped_reads_in_sample_dict(
input_dict, name, uncapped_reads)
temp_sample_dict = input_dict.get(name, dict())

return name, temp_pctQ30_dict, temp_uncapped_reads_dict
if pctQ30 != NA_VAL:
pctQ30 = round(pctQ30, 3)
temp_sample_dict[PCT_30_KEY] = pctQ30
temp_sample_dict[UNCAPPED_READS_KEY] = uncapped_reads
input_dict[name] = temp_sample_dict


def gather_pct_gte_q30(q30_file_list_fp, se_or_pe, data_dict):
@@ -172,23 +153,13 @@ def gather_pct_gte_q30(q30_file_list_fp, se_or_pe, data_dict):
if se_or_pe == SE_VALUE:
for R1 in q30s_paths:
S1 = parseSeqQual(R1)

name, temp_pctQ30_dict, temp_uncapped_reads_dict = \
generate_q30_based_values(data_dict, R1, S1)

data_dict[name] = temp_pctQ30_dict
data_dict[name] = temp_uncapped_reads_dict
insert_q30_based_values(data_dict, R1, S1)
elif se_or_pe == PE_VALUE:
# Get Q30s for both R1 and R2
for R1, R2 in pairwise(q30s_paths):
S1 = parseSeqQual(R1)
S2 = parseSeqQual(R2)

name, temp_pctQ30_dict, temp_uncapped_reads_dict = \
generate_q30_based_values(data_dict, R1, S1, R2, S2)

data_dict[name] = temp_pctQ30_dict
data_dict[name] = temp_uncapped_reads_dict
insert_q30_based_values(data_dict, R1, S1, R2, S2)
else:
print(f"Warning: Unable to run generate percent greater than q30 and "
f"number of uncapped reads for input type '{se_or_pe}'")
@@ -216,10 +187,9 @@ def gather_sub_map_pct(sub_map_stats_file_list_fp, data_dict):
reads = parseSubMapStats(sample) # format: [Mapped, Unmapped]
pctAligned = _calc_pct_aligned(reads)

temp_pctAligned_dict = insert_sub_map_pct_in_sample_dict(
data_dict, name, pctAligned)

data_dict[name] = temp_pctAligned_dict
temp_sample_dict = data_dict.get(name, dict())
temp_sample_dict[SUBSAMPLED_MAPPED_PCT_ALIGNED_KEY] = pctAligned
data_dict[name] = temp_sample_dict

return data_dict
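
The removed insert_*_in_sample_dict helpers are folded into in-place updates of the shared per-sample dict, so callers no longer capture and re-assign partial dictionaries. A minimal sketch of the pattern; the sample name and metric keys below are illustrative, not the module's actual key constants.

data_dict = {}

def insert_metrics(data_dict, name, pct_q30, uncapped_reads):
    # fetch (or create) the per-sample dict, update it, and write it back once
    sample_dict = data_dict.get(name, dict())
    sample_dict["pct_gte_q30"] = round(pct_q30, 3)
    sample_dict["uncapped_reads"] = uncapped_reads
    data_dict[name] = sample_dict

insert_metrics(data_dict, "sample_A", 93.4567, 120000)
print(data_dict)  # {'sample_A': {'pct_gte_q30': 93.457, 'uncapped_reads': 120000}}
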

9 changes: 7 additions & 2 deletions qc/metadata_generation.py
@@ -12,6 +12,7 @@
SUBMIT_TO_GISAID = "submit_to_gisaid"
INDEX_COL_NAME = "Unnamed: 0"
SUBJ_AGE = "subject_age"
METADATA_CLEARED = "metadata_cleared"

BJORN_COL_NAMES = ["Sample ID", "SEARCH SampleID", "Ready for release?",
"New sequences ready for release", "Released?",
@@ -29,7 +30,7 @@
"Sequenced Pool Component Id", "Variant File S3 URL",
"Consensus File S3 URL", "BAM File S3 URL",
"Source", "Sequencing Run", "Overall Fail",
"Inspect Submit-to-GISAID"]
"Inspect Submit-to-GISAID", METADATA_CLEARED]


def filter_metadata_for_bjorn(full_df):
@@ -72,11 +73,14 @@ def generate_bjorn_df(filtered_df):
# The metadata reads in as strings (hence 'True') while the qc and lineage
# data reads in parsed (hence False, not 'False')
inspect_approval = filtered_df[SUBMIT_TO_GISAID] == 'True' # noqa 712
# NB: if sample's metadata_cleared is NA, sample IS allowed for release.
# This is because we have no knowledge about whether that metadata is bad.
metadata_bad = filtered_df[METADATA_CLEARED] == 'False' # noqa 712
# NB: this check is false for overall_fail values of None, as it should be:
# don't release items with overall_fail == None since they are unknown
no_overall_fail = filtered_df[OVERALL_FAIL] == False # noqa 712

release_mask = inspect_approval & no_overall_fail
release_mask = inspect_approval & no_overall_fail & ~metadata_bad
output_df.loc[:, "ready_for_release"] = "No"
output_df.loc[release_mask, "ready_for_release"] = "Yes"
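
The new metadata_bad term relies on standard pandas comparison semantics: a NaN metadata_cleared value compared to the string 'False' yields False, so ~metadata_bad remains True for such rows and they stay eligible for release, matching the NB above. A self-contained sketch with illustrative values:

import numpy as np
import pandas as pd

df = pd.DataFrame({"metadata_cleared": ["True", "False", np.nan]})
metadata_bad = df["metadata_cleared"] == "False"
print((~metadata_bad).tolist())  # [True, False, True]; the NaN row is not blocked
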

@@ -136,6 +140,7 @@ def generate_bjorn_df(filtered_df):
output_df.loc[:, SEQ_RUN] = filtered_df[SEQ_RUN]
output_df.loc[:, OVERALL_FAIL] = filtered_df[OVERALL_FAIL]
output_df.loc[:, SUBMIT_TO_GISAID] = filtered_df[SUBMIT_TO_GISAID]
output_df.loc[:, METADATA_CLEARED] = filtered_df[METADATA_CLEARED]

return output_df

8 changes: 7 additions & 1 deletion qc/qc_summary.sh
@@ -94,6 +94,12 @@ runQC () {
echo -e "subset_csv.py exit code: $?" >> $WORKSPACE/"$SEQ_RUN"-qc.exit.log
cat $PASSING_CONS_FNAMES > $WORKSPACE/"$SEQ_RUN"-passQC.fas

# generate file of summary and passQC fas checksums, for record-keeping
python $PIPELINEDIR/pipeline/document_file_checksums.py \
$WORKSPACE $WORKSPACE/"$SEQ_RUN"_artifact_checksums.csv \
"$SEQ_RUN-summary.csv" "$SEQ_RUN-passQC.fas"
echo -e "document_file_checksums.py exit code: $?" >> $WORKSPACE/"$SEQ_RUN"-qc.exit.log

# Exit codes
echo "Gathering per-sample exit codes."
cat $WORKSPACE/*/*error.log > $WORKSPACE/"$SEQ_RUN".error.log
@@ -109,7 +115,7 @@ runQC () {
# cumulative data folder
S3CUMULATIVE=$S3DOWNLOAD
aws s3 cp $WORKSPACE/"$SEQ_RUN"-passQC.fas $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
aws s3 cp $WORKSPACE/"$SEQ_RUN".fas $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
#aws s3 cp $WORKSPACE/"$SEQ_RUN".fas $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
aws s3 cp $WORKSPACE/"$SEQ_RUN"-summary.csv $S3CUMULATIVE/phylogeny/cumulative_data/consensus/
}

10 changes: 6 additions & 4 deletions qc/tree_prep.py
@@ -56,8 +56,9 @@ def prep_files_for_tree_building(arg_list):
metadata_fp = arg_list[2] # May be None
full_fasta_fp = arg_list[3]
out_stringent_fasta_fp = arg_list[4]
out_empress_fp = arg_list[5]
out_stringent_empress_fp = arg_list[6]
out_stringent_empress_fp = arg_list[5]
# optional 7th argument for passQC (not just stringent) empress metadata
out_empress_fp = None if len(arg_list) <= 6 else arg_list[6]

qc_and_lineage_df = pd.read_csv(qc_and_lineage_fp) # , dtype=str)

@@ -75,8 +76,9 @@ def prep_files_for_tree_building(arg_list):
raw_empress_df = qc_and_lineage_df.copy()

base_empress_df = generate_base_empress_df(raw_empress_df)
# NB that empress metadata files must be tsv
base_empress_df.to_csv(out_empress_fp, sep='\t', index=False)
if out_empress_fp:
# NB that empress metadata files must be tsv
base_empress_df.to_csv(out_empress_fp, sep='\t', index=False)

stringent_mask = \
base_empress_df[STRINGENT_TEST_COL] == STRINGENT_INCLUDE_VAL
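
With the reordered arguments, the passQC empress metadata path becomes an optional trailing argument, which is why the lineages.sh call above now passes only the stringent path. A sketch of the two invocation shapes; the file names are illustrative.

# seven-element arg_list: the optional passQC empress metadata TSV is also written
prep_files_for_tree_building([
    "tree_prep.py", "qc_and_lineages.csv", "metadata.csv",
    "passQC.fas", "stringent_only.fas",
    "stringent_refs_hist_empress_metadata.tsv",
    "passQC_refs_hist_empress_metadata.tsv"])

# six-element arg_list: out_empress_fp stays None and no passQC metadata TSV is written
prep_files_for_tree_building([
    "tree_prep.py", "qc_and_lineages.csv", "metadata.csv",
    "passQC.fas", "stringent_only.fas",
    "stringent_refs_hist_empress_metadata.tsv"])
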