bjorn_support.py
import sys
import glob
import subprocess
import gzip

import gffutils
import numpy as np
import pandas as pd
from path import Path
from Bio import Seq, SeqIO, AlignIO, Phylo, Align


def integrate_gisaid_meta(old_meta_fp, xtra_fp, msa_fp, rename_cols, drop_cols):
    """Merge an existing metadata table with additional GISAID metadata.

    Parses country, division, and location from the GISAID `Location` field,
    removes USA B.1.1.7 records from the old metadata, and concatenates the
    two tables. `msa_fp` is currently unused.
    """
    xtra = pd.read_csv(xtra_fp, sep='\t')
    xtra.rename(columns=rename_cols, inplace=True)
    xtra['country'] = xtra['Location'].apply(lambda x: x.split('/')[1].strip())
    xtra['division'] = xtra['Location'].apply(lambda x: x.split('/')[2].strip())
    xtra['location'] = xtra['Location'].apply(clean_locs)
    xtra['strain'] = xtra['strain'].apply(lambda x: '/'.join(x.split('/')[1:]))
    sois = xtra['strain'].unique().tolist()
    sois = dict.fromkeys(sois)
    meta = pd.read_csv(old_meta_fp, sep='\t', compression='gzip')
    meta = meta.loc[~((meta['country'].str.contains('USA')) & (meta['pangolin_lineage'] == 'B.1.1.7'))]
    meta = pd.concat([meta, xtra])
    meta.drop(columns=drop_cols, inplace=True)
    meta.loc[meta['country'] == 'USA', 'country'] = 'United States of America'
    return meta
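
# Illustrative usage (a minimal sketch; the file paths, column renames, and
# dropped columns below are hypothetical):
#
#   meta = integrate_gisaid_meta('metadata_old.tsv.gz', 'gisaid_extra.tsv', 'msa.fasta',
#                                rename_cols={'Virus name': 'strain'},
#                                drop_cols=['Location'])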


def clean_locs(x):
    """Helper to pull the sub-division location (4th field) out of a GISAID `Location` string."""
    if len(x.split('/')) > 3:
        return x.split('/')[3].strip()
    return 'unk'


def fetch_seqs(seqs_filepath, out_fp, sample_idxs: list, is_aligned=False, is_gzip=False):
    """Write the records whose IDs appear in `sample_idxs` to `out_fp` (fasta)."""
    if is_aligned:
        if is_gzip:
            with gzip.open(seqs_filepath, "rt") as handle:
                cns = AlignIO.read(handle, 'fasta')
        else:
            cns = AlignIO.read(seqs_filepath, 'fasta')
        my_cns = Align.MultipleSeqAlignment([rec for rec in cns if rec.id in sample_idxs])
        return AlignIO.write(my_cns, out_fp, 'fasta')
    else:
        if is_gzip:
            # filter inside the with-block so the gzip handle is still open while parsing
            with gzip.open(seqs_filepath, "rt") as handle:
                my_cns = [rec for rec in SeqIO.parse(handle, 'fasta') if rec.id in sample_idxs]
        else:
            my_cns = [rec for rec in SeqIO.parse(seqs_filepath, 'fasta') if rec.id in sample_idxs]
        return SeqIO.write(my_cns, out_fp, 'fasta')
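
# Illustrative usage (a sketch; the file paths and IDs below are hypothetical):
#
#   ids = ['hCoV-19/USA/CA-0001/2021', 'hCoV-19/USA/CA-0002/2021']
#   fetch_seqs('sequences.fasta.gz', 'subset.fasta', ids, is_aligned=False, is_gzip=True)
#
# `sample_idxs` is compared against `rec.id`, so the IDs must match the fasta
# headers exactly.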


def get_filepaths(analysis_path: str, data_fmt: str, sample_ids: list=[],
                  data_type: str='consensus', tech: str='illumina') -> dict:
    """Take a list of sample IDs, the general area where your files are located, and their format.
    Returns a dictionary with sample IDs as keys and the filepaths as values.
    """
    file_paths = {}
    if sample_ids:
        for s_id in sample_ids:
            f = glob.glob(f"{analysis_path}/*{data_type}*/*{tech}*/*{s_id}*.{data_fmt}")
            try:
                file_paths[s_id] = f[0]
            except IndexError:
                # no file found for this sample ID
                continue
    else:
        fs = glob.glob(f"{analysis_path}/*{data_type}*/*{tech}*/*.{data_fmt}")
        for f in fs:
            try:
                sample_id = f.split('/')[-1].split('_')[0].split('-')[1]
            except IndexError:
                # filename does not follow the expected <run>-<sample>_... pattern
                continue
            file_paths[sample_id] = f
    return file_paths
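
# Illustrative usage (a sketch; the analysis path, directory layout, and sample
# IDs below are hypothetical):
#
#   cons_fps = get_filepaths('/path/to/analysis', data_fmt='fa',
#                            sample_ids=['S001', 'S002'],
#                            data_type='consensus', tech='illumina')
#   # -> {'S001': '/path/to/analysis/consensus/illumina/run1-S001_cons.fa', ...}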


def get_variant_filepaths(sample_ids: list, analysis_path: str='/home/gk/analysis') -> dict:
    """Takes a list of sample IDs and returns filepaths of variant data for the corresponding samples"""
    variant_paths = {}
    for s_id in sample_ids:
        f = glob.glob(f"{analysis_path}/**/variants/illumina/*{s_id}*.tsv")
        variant_paths[s_id] = f[0]
    return variant_paths


def find_loc(f: str):
    """Helper function to fetch the location from a filepath"""
    return f.split('/')[-1].split('_')[0].split('-')[-1]


def get_variant_data(variant_filepaths: dict) -> pd.DataFrame:
    """Takes a dict of variant filepaths and loads all variant data into a single dataframe"""
    df = pd.concat(pd.read_csv(f, sep='\t')
                     .assign(sample=s_id, location=find_loc(f))
                   for s_id, f in variant_filepaths.items())
    return df
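
# Illustrative usage (a sketch; the sample IDs and analysis path are hypothetical):
#
#   v_fps = get_variant_filepaths(['S001', 'S002'], analysis_path='/path/to/analysis')
#   variants = get_variant_data(v_fps)
#   # `variants` stacks the per-sample variant TSVs into one dataframe,
#   # with `sample` and `location` columns added.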


def load_fasta(fasta_filepath, is_gzip=False, is_aligned=False):
    """Load a (possibly gzipped) fasta file, either as an alignment or as sequence records."""
    if is_gzip:
        with gzip.open(fasta_filepath, "rt") as handle:
            if is_aligned:
                cns = AlignIO.read(handle, 'fasta')
            else:
                # materialize the records before the gzip handle is closed
                cns = list(SeqIO.parse(handle, 'fasta'))
    else:
        if is_aligned:
            cns = AlignIO.read(fasta_filepath, 'fasta')
        else:
            cns = SeqIO.parse(fasta_filepath, 'fasta')
    return cns
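
# Illustrative usage (a sketch; the file names are hypothetical):
#
#   aln = load_fasta('msa.fasta.gz', is_gzip=True, is_aligned=True)   # MultipleSeqAlignment
#   recs = load_fasta('sequences.fasta')                              # SeqRecord iterator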


def run_datafunk(in_filepath, ref_path, out_filepath):
    """Convert a SAM alignment to a padded, reference-length fasta using datafunk sam_2_fasta"""
    df_cmd = f"datafunk sam_2_fasta -s {in_filepath} -r {ref_path} -o {out_filepath} --pad --log-inserts"
    run_command(df_cmd)
    return out_filepath


def run_minimap2(in_filepath, out_filepath, ref_path, num_cpus=25):
    """Align sequences to the reference with minimap2 (asm5 preset) and write a SAM file"""
    map_cmd = f"minimap2 -a -x asm5 -t {num_cpus} {ref_path} {in_filepath} -o {out_filepath}"
    run_command(map_cmd)
    return out_filepath
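
# Illustrative usage (a sketch; all file paths below are hypothetical):
#
#   sam_fp = run_minimap2('consensus.fasta', 'consensus.sam', ref_path='NC_045512.2.fasta')
#   aln_fp = run_datafunk('consensus.sam', 'NC_045512.2.fasta', 'consensus_aln.fasta')
#
# minimap2 produces the SAM alignment against the reference, and datafunk's
# sam_2_fasta then converts it into a padded fasta.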


def concat_fasta_2(in_filepaths: list, out_filepath):
    """Concatenate fasta sequences into a single fasta file.
    Takes a list of fasta filepaths and an output filename for saving"""
    cat_cmd = f"cat {' '.join(in_filepaths)} > {out_filepath}"
    run_command(cat_cmd)
    return out_filepath


def sample_fasta(fasta_filepath, out_filepath, sample_size=100):
    """Sample the first n sequences from the input fasta file"""
    sampled_seqs = []
    seqs = load_fasta(fasta_filepath)
    for i, rec in enumerate(seqs):
        if i >= sample_size:
            break
        sampled_seqs.append(rec)
    # write once, after collecting the first `sample_size` records
    SeqIO.write(sampled_seqs, out_filepath, 'fasta')
    return out_filepath


def concat_fasta(in_dir, out_dir):
    """Concatenate all fasta files in a directory into a single fasta file"""
    cat_cmd = f"cat {in_dir}/*.fa* > {out_dir}.fa"
    subprocess.check_call(cat_cmd, shell=True)
    return f"{out_dir}.fa"


def run_command(cmd):
    """Helper function used to run bash commands, streaming their stderr output as it arrives"""
    p = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
    # do not wait for the command to finish; echo stderr as soon as it is produced
    while True:
        out = p.stderr.read(1)
        if out == b'' and p.poll() is not None:
            break
        if out != b'':
            sys.stdout.write(out.decode('utf-8'))
            sys.stdout.flush()


def align_fasta(fasta_filepath, out_filepath, num_cpus=8):
    """Generate a Multiple Sequence Alignment of the concatenated sequences in the input fasta file using mafft.
    TODO: ALLOW USER TO INPUT CUSTOM COMMAND"""
    msa_cmd = f"mafft --auto --thread {num_cpus} {fasta_filepath} > {out_filepath}"
    subprocess.check_call(msa_cmd, shell=True)
    return out_filepath


def align_fasta_reference(fasta_filepath, out_filepath, ref_fp: str, num_cpus=8):
    """Generate a Multiple Sequence Alignment against a reference using mafft --addfragments,
    keeping the reference length (--keeplength)"""
    msa_cmd = f"mafft --auto --thread {num_cpus} --keeplength --addfragments {fasta_filepath} {ref_fp} > {out_filepath}"
    run_command(msa_cmd)
    return out_filepath
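
# Illustrative usage (a sketch; the file names are hypothetical):
#
#   msa_fp = align_fasta_reference('consensus.fasta', 'msa.fasta',
#                                  ref_fp='NC_045512.2.fasta', num_cpus=8)
#
# --keeplength keeps the alignment in reference coordinates, so positions stay
# comparable to the NC_045512.2 coordinates used by map_gene_to_pos below.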


def compute_tree(msa_filepath, num_cpus=8, redo=False):
    """Compute an ML tree of the aligned sequences in the input fasta using iqtree"""
    out_filepath = msa_filepath + '.treefile'
    if redo:
        tree_cmd = f"iqtree -redo -s {msa_filepath} -nt {num_cpus} -m HKY -czb -fast"
    else:
        tree_cmd = f"iqtree -s {msa_filepath} -nt {num_cpus} -m HKY -czb -fast"
    run_command(tree_cmd)
    return out_filepath
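
# Illustrative usage (a sketch; the file paths are hypothetical):
#
#   tree_fp = compute_tree('analysis/msa.fasta', num_cpus=8)     # -> 'analysis/msa.fasta.treefile'
#   tt_dir = compute_time_tree('analysis/msa.fasta', tree_fp)    # -> 'analysis/timetree'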


def compute_time_tree(msa_filepath, tree_filepath, num_cpus=8):
    """Run TreeTime ancestral reconstruction on the alignment and ML tree;
    results are written to a `timetree` directory next to the alignment (`num_cpus` is currently unused)"""
    out_path = '/'.join(msa_filepath.split('/')[:-1]) + '/timetree'
    tree_cmd = f"treetime ancestral --aln {msa_filepath} --tree {tree_filepath} --outdir {out_path}"
    subprocess.check_call(tree_cmd, shell=True)
    return out_path


def add_gene_column(df: pd.DataFrame) -> pd.DataFrame:
    """Takes a dataframe containing intra-host variant information and adds a column indicating the gene for each mutation.
    Uses GFF_FEATURE to infer the gene, and when GFF_FEATURE is missing (i.e. for indels), it uses the nucleotide position
    to infer the gene. The `gff2gene` mapping was obtained from the file /home/al/data/hcov19/gff/NC_045512.2.gff3 using
    the create_gff2gene_mapping() function defined below."""
    gff2gene = {
        'cds-YP_009724389.1': 'ORF1ab',
        'cds-YP_009725295.1': 'ORF1ab',
        'cds-YP_009724390.1': 'S',
        'cds-YP_009724396.1': 'ORF8',
        'cds-YP_009724397.2': 'N',
        'cds-YP_009724395.1': 'ORF7a',
        'cds-YP_009724391.1': 'ORF3a',
        'cds-YP_009724393.1': 'M',
        'cds-YP_009724394.1': 'ORF6',
        'cds-YP_009725255.1': 'ORF10',
        'cds-YP_009725318.1': 'ORF7b',
        'cds-YP_009724392.1': 'E'
    }
    # infer the gene from GFF_FEATURE
    if 'GFF_FEATURE' not in df.columns:
        raise KeyError('GFF_FEATURE column not found in the input dataframe.')
    df['gene'] = df['GFF_FEATURE'].apply(lambda x: gff2gene.get(x, 'nan'))
    # infer the gene from position when GFF_FEATURE is missing
    if 'POS' not in df.columns:
        raise KeyError('POS column not found in the input dataframe.')
    df.loc[df['gene'] == 'nan', 'gene'] = df.loc[df['gene'] == 'nan', 'POS'].apply(map_gene_to_pos)
    return df
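
# Illustrative usage (a sketch; builds on the get_variant_data example above):
#
#   variants = get_variant_data(v_fps)
#   variants = add_gene_column(variants)
#   # rows with a known GFF_FEATURE get their gene from the lookup table;
#   # rows without one (e.g. indels) fall back to map_gene_to_pos on POS.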


def map_gene_to_pos(x):
    """Helper function to infer the gene based on the nucleotide position in the SARS-CoV-2 genome"""
    pos = x
    if pos <= 265:
        return '5UTR'
    elif pos > 265 and pos <= 13466:
        return 'ORF1a'
    elif pos > 13466 and pos <= 21555:
        return 'ORF1b'
    elif pos > 21562 and pos <= 25384:
        return 'S'
    elif pos > 25392 and pos <= 26220:
        return 'ORF3a'
    elif pos > 26244 and pos <= 26472:
        return 'E'
    elif pos > 26522 and pos <= 27191:
        return 'M'
    elif pos > 27201 and pos <= 27387:
        return 'ORF6'
    elif pos > 27393 and pos <= 27759:
        return 'ORF7a'
    elif pos > 27755 and pos <= 27887:
        return 'ORF7b'
    elif pos > 27893 and pos <= 28259:
        return 'ORF8'
    elif pos > 28273 and pos <= 29533:
        return 'N'
    elif pos > 29557 and pos <= 29674:
        return 'ORF10'
    elif pos > 29674:
        return '3UTR'
    return 'nan'
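
# Illustrative usage:
#
#   map_gene_to_pos(23403)   # -> 'S'    (e.g. the D614G position falls inside the spike range)
#   map_gene_to_pos(100)     # -> '5UTR'
#   map_gene_to_pos(21560)   # -> 'nan'  (positions between annotated ranges return 'nan')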


def create_gff2gene_mapping(variant_data: pd.DataFrame, gff_filepath: str) -> dict:
    """Build a {GFF_FEATURE: gene name} mapping from a GFF3 annotation using gffutils
    (imported at the top of this module)."""
    fn = gffutils.example_filename(gff_filepath)
    db = gffutils.create_db(fn, dbfn='test.db', merge_strategy='merge', force=True)
    all_gffs = variant_data['GFF_FEATURE'].dropna().unique()
    gff2gene = {}
    for gff in all_gffs:
        if gff:
            gff2gene[gff] = db[gff]['gene'][0]
    return gff2gene
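
# Illustrative usage (a sketch; the GFF path is hypothetical, and note that a
# sqlite file 'test.db' is created in the working directory as a side effect):
#
#   gff2gene = create_gff2gene_mapping(variants, '/path/to/NC_045512.2.gff3')
#   # -> {'cds-YP_009724390.1': 'S', ...}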