-
Notifications
You must be signed in to change notification settings - Fork 0
/
withfunctions.py
150 lines (124 loc) · 4.71 KB
/
withfunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import csv
from optparse import OptionParser
from string import maketrans
#options
parser = OptionParser()

# Input / output locations.
parser.add_option(
    "-a", "--inputfile",
    help="The file to be searched (.csv)",
    dest="filename")
parser.add_option(
    "-o", "--outputfile",
    help="The file to be written to (.csv)",
    dest="newfilename")

# Matching behaviour.
parser.add_option(
    "-k", "--casesensitive",
    help="If the search should be case sensitive",
    dest="casesensitive", action="store_true", default=False)
parser.add_option(
    "-d", "--hammingdistance",
    help="The desired hamming distance for the search",
    dest="hammingdistance", type="int", default=1)
parser.add_option(
    "-s", "--substring",
    help="The substring that is being searched for",
    dest="substring")

# Complement handling.
parser.add_option(
    "-c", "--complement",
    help="If the search should be for the search query's complement",
    dest="complement", action="store_true", default=False)
parser.add_option(
    "-r", "--rna",
    help="If the string to be searched is an RNA strand, and the search is for the complement",
    dest="rna", action="store_true", default=False)

# Parsing happens at import time; the functions below read some of the
# resulting module-level names directly.
options, args = parser.parse_args()

# Module-level aliases for the parsed option values.
file_path, new_file_path = options.filename, options.newfilename
case_sensitive = options.casesensitive
hd = options.hammingdistance
sub_string = options.substring
complement, rna = options.complement, options.rna
#complement function
# Base-pairing tables. Lower-case entries are included so that segments keep
# working whatever the case of the input sequence.
_DNA_PAIRS = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
              'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}
_RNA_PAIRS = {'A': 'U', 'U': 'A', 'G': 'C', 'C': 'G',
              'a': 'u', 'u': 'a', 'g': 'c', 'c': 'g'}


def create_DNA_complement(string_segment, is_rna=None):
    """Return the base-by-base complement of a nucleotide segment.

    Args:
        string_segment: The sequence to complement. Each character must be
            one of A/C/G plus T (DNA) or U (RNA), in either case.
        is_rna: Use RNA pairing (A<->U) when true, DNA pairing (A<->T) when
            false. When left as None, the module-level ``rna`` option is used.

    Returns:
        A string of the same length with every base replaced by its partner.
        The order is preserved (this is the complement, not the reverse
        complement), and so is the case of each base.

    Raises:
        KeyError: If the segment contains a character that is not a valid
            base for the selected alphabet.
    """
    if is_rna is None:
        # Default to the command-line flag when the caller does not choose.
        is_rna = rna
    pairs = _RNA_PAIRS if is_rna else _DNA_PAIRS
    # The result is built in one pass. Previously the accumulator was only
    # initialised on the DNA branch, so the RNA path raised UnboundLocalError,
    # and every result was echoed to stdout by a leftover debug print.
    return ''.join(pairs[base] for base in string_segment)
#hamming distance function
def hamming_distance(sub_string, string_segment):
    """Count the positions at which two equal-length strings differ.

    Args:
        sub_string: The query string.
        string_segment: The candidate string to compare against the query.

    Returns:
        The number of mismatching positions. When the two strings have
        different lengths no comparison is made and 0 is returned, so callers
        must check the lengths themselves before trusting a result of 0.
    """
    if len(sub_string) != len(string_segment):
        return 0
    return sum(1 for a, b in zip(sub_string, string_segment) if a != b)
#search function to return a dictionary with all instances found
def find_all_instances(file_path, sub_string, hd, case_sensitve):
    """Find every approximate occurrence of a query in each sequence of a CSV.

    Each usable line of the input file is ``name,sequence``; any further
    columns are ignored. A window of ``len(sub_string)`` characters is slid
    over every sequence, and a window counts as a hit when its Hamming
    distance to the query is at most ``hd``. When the module-level
    ``complement`` option is set, each window is complemented with
    ``create_DNA_complement`` before it is compared.

    Args:
        file_path: Path of the CSV file to read.
        sub_string: The query to search for.
        hd: Maximum number of mismatches allowed (converted with ``int``).
        case_sensitve: When false, query and sequence are compared
            case-insensitively. The misspelled name is kept so that existing
            callers continue to work.

    Returns:
        A dict mapping each sequence name that has at least one hit to a list
        of the 0-based start indices of its hits, each stored as a string.
        If a name occurs on several lines, only the last one is searched.
    """
    hd = int(hd)
    if not case_sensitve:
        # Fold case only for an insensitive search. The original test was
        # inverted and consulted the global flag instead of this argument.
        sub_string = sub_string.lower()
    width = len(sub_string)

    # Read the sequences. The file is only read, so it is opened read-only.
    strings_for_searching = {}
    with open(file_path, 'r') as searchfile:
        for line in searchfile:
            fields = line.strip().split(',')
            if len(fields) < 2:
                # Blank or single-column line: there is no sequence to search.
                continue
            strings_for_searching[fields[0]] = fields[1]

    instances_dict = {}
    for name, string in strings_for_searching.items():
        for index in range(len(string)):
            string_segment = string[index:index + width]
            if len(string_segment) != width:
                # The window has run off the end; later ones are shorter still.
                break
            if complement:
                # The complement is taken before case folding, so the
                # base-pairing lookup sees the sequence exactly as stored.
                string_segment = create_DNA_complement(string_segment)
            if not case_sensitve:
                string_segment = string_segment.lower()
            if hamming_distance(sub_string, string_segment) <= hd:
                instances_dict.setdefault(name, []).append(str(index))
    return instances_dict
#creates a usable output from the function (for writing to a new csv file)
found = find_all_instances(file_path, sub_string, hd, case_sensitive)

# Append a header line and then one "<name>,<idx>;<idx>;..." row per sequence
# that had at least one hit. The file is opened in append mode, so results
# (and the header) accumulate across runs.
with open(new_file_path, 'ab') as csvfile:
    csvfile.write('sequence,location(s)\n')
    for key in found:
        locations = ';'.join(str(i) for i in found[key])
        entry = ','.join([str(key), locations]) + '\n'
        csvfile.write(entry)
#----------------------------------------tests--------------------------------------------------------------------------
# NOTE: this whole suite is disabled. Everything below is text inside adjacent
# string literals, so none of it is executed.
# The instances_dict_tests cases read the module-level `found` result and
# expect entries formatted as 'index N', whereas find_all_instances stores
# the bare index as a string (str(index)); the expected values would need
# updating before these tests could be re-enabled.
"""import unittest
class hamming_distance_tests(unittest.TestCase):
def test0(self):
self.assertEqual(hamming_distance('acgt', 'acgt'), 0)
def test1(self):
self.assertEqual(hamming_distance('acgt','tcgt'), 1)
def test2(self):
self.assertEqual(hamming_distance('acgt','tggt'), 2)
def test3(self):
self.assertEqual(hamming_distance('acgt','tgct'), 3)
def test4(self):
self.assertEqual(hamming_distance('acgt','tgca'), 4)
class instances_dict_tests(unittest.TestCase):
""""""relies on searching a specific file for "GGGG". This is the text of that specific csv file:
string1,AAAAAAAAAAAAAGGGAAAAAAAAAAAAAAAAAAGGGAAA
string2,GGGGAAAAAAAGGGGAAAA
string3,AAAAAAAAAAAAAAGGGGAAAAAAAAAAAAAAAAAAGGGGAAAA""""""
def teststring1(self):
self.assertEqual(found['string1'], ['index 12', 'index 13', 'index 33', 'index 34'])
def teststring2(self):
self.assertEqual(found['string2'], ['index 0', 'index 1', 'index 10', 'index 11', 'index 12'])
def teststring3(self):
self.assertEqual(found['string3'], ['index 13', 'index 14', 'index 15', 'index 35', 'index 36', 'index 37'])
def main():
unittest.main()
if __name__ == '__main__':
main()
"""