-
Notifications
You must be signed in to change notification settings - Fork 1
/
simulateReads.py
executable file
·160 lines (138 loc) · 5.82 KB
/
simulateReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#################################
######### SiLiCO v. 1.0 #########
### (c)2016 Ethan A. G. Baker ###
##### ethanagbaker@pitt.edu #####
#################################
from __future__ import division
from natsort import natsorted
import numpy as np
import sys, getopt, math, gzip
from getRandomPosition import getRandomPosition
from findChromosome import findChromosome
def simulateReads(infileName,outfileName,mean,std,desired_cov,trialCount,names,seqMode):
#Initialize variables
y=0
loopController=True
#Open the index file (chrdist.td)
with open("SiLiCO_Scratch/chrdist.td",'r') as infile:
lengths = []
names = []
for x in infile:
length = x.split('\t')[1]
name = x.split('\t')[0]
lengths.append(int(length))
names.append(str(name))
infile.close()
lengthDict = dict(zip(names,lengths))
genomeLength=0
names=natsorted(names) #A sorted list of chromosome names.
#Calculate the total genome length
for n in lengths:
genomeLength += n
chrCount = len(lengths)
thresholdDict={} #A dictionary of chromome thresholds (defined as end of a chromsome as determined by length from chrdist.td)
correctionDict={} #A correction to get the start of the chromosome. Important for generating bed file.
#Generate the threshold dictionary.
print("Building threshold dictionaries...")
for chrom in range(0,len(names)): #might have to change this because it will change chrX to chr24 (build an array of names and use an index)
name = str(names[chrom])#this name will go as key in dict
if name =="chr1":
thresholdDict[name]=lengthDict[name]
else:
threshVal = 0
correctedVal = 0
i=1
while (i<=chrom+1):
name2 = "chr"+str(i)
threshVal += lengthDict[name2]
i += 1
correctedVal = threshVal - lengthDict[name2]
thresholdDict[name2] = threshVal
#Build correction dictionary
for j in range(0,chrCount):
chromName = str(names[j])
if j-1 < 0: #chr1
correctionDict[chromName] = 0
elif j-1 >= 0:
prevChromName = str(names[j-1])
correctionDict[chromName] = thresholdDict[prevChromName]
#Some calculations for the appropriate distribution.
print("Calculating distribution parameters...")
if seqMode == 1: #PacBio/LogNormal
sigma = (math.log(1+(float(mean)/(float(std))**2)))**0.5
mu = math.log(float(mean))-0.5*sigma**2
req_reads = int((int(desired_cov)*genomeLength)/float(mean))
#Begin generating in-silico reads
trial_counter = 0
while trial_counter < int(trialCount):
#print("This is trial " + str(trial_counter))
read_length_counter = 0
read_pos_counter = 0
readlengths = None
readlengths=np.random.lognormal(mu,sigma,req_reads) #Read lengths are randomly determined from the calculated log-normal distribution.
read_pos=[]
name_counter = 0
outfile = gzip.open(str(outfileName) + '/simulated_read_positions_trial_'+str(trial_counter) +'.bed.gz','wb')
for length in readlengths:
while True:
x = int(round(length))
buf = x/2 #protects against end selection bias and simulated read bridging two chromosomes, in the event of a .5, rounds up to the whole
y=getRandomPosition(buf,genomeLength,thresholdDict,names,lengthDict)
#print(str(y))
start_pos = int(y-buf)
end_pos = int(y+buf)
#Figure out which chromosome this is in
selected_chrom = findChromosome(start_pos,names,thresholdDict)
end_pos_chrom = findChromosome(end_pos,names,thresholdDict)
if selected_chrom == end_pos_chrom:
outfile.write(str(selected_chrom) + '\t' + str(start_pos-correctionDict[str(selected_chrom)]) + '\t' + str(end_pos-correctionDict[str(selected_chrom)]) + '\t' + 'trial_'+str(trial_counter) +'_sim_read_' + str(name_counter) + '\n')
break
#count this run and reset reused variables.
name_counter+=1
x=None
y=None
selected_chrom=None
start_pos=None
end_pos=None
outfile.close()
trial_counter+=1
print("Completed trial " + str(trial_counter) + " of " + str(trialCount) + ". ("+ str(100*(float(trial_counter)/trialCount)) + "%)")
elif seqMode == 2: #nanopore/gamma
scale = float(std)/mean
shape = (mean**2)/float(std)
req_reads = int((int(desired_cov)*genomeLength)/float(mean))
#Begin generating in-silico reads
trial_counter = 1
while trial_counter <= int(trialCount):
read_length_counter = 0
read_pos_counter = 0
readlengths = None
readlengths=np.random.gamma(shape,scale,req_reads) #Read lengths are randomly determined from the calculated log-normal distribution.
read_pos=[]
name_counter = 0
outfile = gzip.open(str(outfileName) + '/simulated_read_positions_trial_'+str(trial_counter) +'.bed.gz','wb')
for length in readlengths:
while True:
x = int(round(length))
buf = x/2 #protects against end selection bias and simulated read bridging two chromosomes, in the event of a .5, rounds up to the whole
y=getRandomPosition(buf,genomeLength,thresholdDict,names,lengthDict)
start_pos = int(y-buf)
end_pos = int(y+buf)
#Figure out which chromosome this is in
selected_chrom = findChromosome(start_pos,names,thresholdDict)
end_pos_chrom = findChromosome(end_pos,names,thresholdDict)
if selected_chrom == end_pos_chrom: #Check to ensure that read lengths are not bridging chromosomes
outfile.write(str(selected_chrom) + '\t' + str(start_pos-correctionDict[str(selected_chrom)]) + '\t' + str(end_pos-correctionDict[str(selected_chrom)]) + '\t' + 'trial_'+str(trial_counter) +'_sim_read_' + str(name_counter) + '\n')
break
#count this run and reset reused variables.
name_counter+=1
x=None
y=None
selected_chrom=None
start_pos=None
end_pos=None
outfile.close()
trial_counter+=1
print("Completed trial " + str(trial_counter) + " of " + str(trialCount) + ". ("+ str(100*(float(trial_counter)/trialCount)) + "%)")
if __name__ == "__main__":
simulateReads(infileName,outfileName,mean,std,desired_cov,trials,names,seqMode)