-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_variants.py
91 lines (80 loc) · 3.61 KB
/
count_variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import time
import json
from path import Path
import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br
import data as bd
with open('config.json', 'r') as f:
config = json.load(f)
out_dir = Path(config['out_dir'])
date = config['date']
gisaid_fasta = config['gisaid_fasta']
gisaid_meta = config['gisaid_meta']
ref_fasta = config['ref_fasta']
# output filenames
fasta_filepath = out_dir/f'sequences_{date}.fasta'
sam_filepath = out_dir/f'sequences_{date}.sam'
alignment_filepath = out_dir/f'sequences_{date}_aligned.fasta'
subs_fp = out_dir/f'subs_long_{date}.csv.gz'
dels_fp = out_dir/f'dels_long_{date}.csv.gz'
# extra configs
num_cpus = int(config['num_cpus'])
is_gzip = config['is_gzip']
is_test = config['is_test']
test_fasta = out_dir/'test.fasta'
test_size = 100
if is_test:
gisaid_fasta = bs.sample_fasta(gisaid_fasta, test_fasta, sample_size=test_size)
print(f"STEP 1: Aligning sequences...")
fasta_filepath = bs.concat_fasta_2([gisaid_fasta, ref_fasta], fasta_filepath)
print(f"Reference sequence added to input sequences and saved at {fasta_filepath}")
t0 = time.time()
sam_filepath = bs.run_minimap2(fasta_filepath, sam_filepath, ref_fasta, num_cpus=num_cpus)
minimap_time = time.time() - t0
print(f"SAM data generated from consensus sequences and saved at {sam_filepath}")
t0 = time.time()
alignment_filepath = bs.run_datafunk(sam_filepath, ref_fasta, alignment_filepath)
datafunk_time = time.time() - t0
print(f"Alignment generated and saved at {alignment_filepath} \n")
print(f"STEP 2: Counting variants...")
print(f"Loading alignment file at {alignment_filepath}")
t0 = time.time()
msa_data = bs.load_fasta(alignment_filepath, is_aligned=True, is_gzip=is_gzip)
msa_load_time = time.time() - t0
print(f"Identifying substitution-based mutations...")
t0 = time.time()
subs, _ = bm.identify_replacements_per_sample(msa_data,
gisaid_meta,
bd.GENE2POS,
data_src='gisaid',
test=is_test)
subs_time = time.time() - t0
subs.to_csv(subs_fp, index=False, compression='gzip')
print(f"Identifying deletion-based mutations...")
t0 = time.time()
dels, _ = bm.identify_deletions_per_sample(msa_data,
gisaid_meta,
bd.GENE2POS,
data_src='gisaid',
min_del_len=1,
test=is_test)
dels_time = time.time() - t0
dels.to_csv(dels_fp, index=False, compression='gzip')
print(f"Substitutions saved in {subs_fp}")
print(f"Deletions saved in {dels_fp}")
total_time = minimap_time + datafunk_time + msa_load_time + subs_time + dels_time
# Data logging
with open(out_dir/'log.txt', 'w') as f:
f.write(f"Total Execution Time: {(total_time / 60):.2f} minutes\n")
f.write(f"minimap2 Execution Time: {(minimap_time / 60):.2f} minutes\n")
f.write(f"datafunk Execution Time: {(datafunk_time / 60):.2f} minutes\n")
f.write(f"Alignment loading Execution Time: {(msa_load_time / 60):.2f} minutes\n")
f.write(f"Counting Substitutions Execution Time: {(subs_time / 60):.2f} minutes\n")
f.write(f"Counting Deletions Execution Time: {(dels_time / 60):.2f} minutes\n")
f.write(f"Alignment loading Execution Time: {(msa_load_time / 60):.2f} minutes\n")
print(f"""END:\tprocess complete.
\n\tlogging information saved at {out_dir/'log.txt'}
\n\tcontact gkarthik@scripps.edu for any issues ;) """)