-
Notifications
You must be signed in to change notification settings - Fork 0
/
siRNA_predict.py
executable file
·131 lines (104 loc) · 4.92 KB
/
siRNA_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public
License as published by the Free Software Foundation, version 2 of the License (GPLv2).
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details at http://www.gnu.org/licenses/.
name: siRNA_predict.py
date: Nov-27-2014
version: 1.0
author: Stephen R. Bond
email: steve.bond@nih.gov
institute: Computational and Statistical Genomics Branch, Division of Intramural Research,
National Human Genome Research Institute, National Institutes of Health
Bethesda, MD
repository: https://github.com/biologyguy/public_scripts
© license: Gnu General Public License, Version 2.0 (http://www.gnu.org/licenses/gpl.html)
derivative work: No
Description:
Python implementation of siRNA prediction criteria as determined by Reynolds et al., 2004, Nat Biotechnol 22(3):326-330.
This script will generate equivalent output to the online tool at http://tiny.cc/naus_siRNA_pred. The sense strand cDNA
is provided as input on the command line, either as a string or FASTA formatted file, and the output is a list of scores
for all possible 19-mer sequences, as a preformatted table or in CSV format. For a detailed description of the
parameters the script takes, navigate to the directory containing the program within a terminal window, and run the
following command:
python ./siRNA_predict.py -h
"""
from re import findall, sub
import argparse
from os.path import isfile
parser = argparse.ArgumentParser(prog="siRNA prediction",
description="Implementation of siRNA design algorithm developed by "
"Reynolds et al., 2004, Nat Biotechnol 22(3):326-330",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('sequence', help='Input DNA sequence to analyze (FASTA file or string)')
parser.add_argument('-c', '--csv', help='Output as pure CSV', action="store_true")
in_args = parser.parse_args()
def si_score(sequence):
_score = 0
# Criteria 1: Moderate to low (30%-52%) GC Content -> 1 point
gc_count = len(findall("G", sequence)) + len(findall("C", sequence))
if 6 <= gc_count <= 10:
_score += 1
# Criteria 2: At least 3 A/Us at positions 15-19 (sense) -> 1 point per (A/U)
terminal_at_count = len(findall("A", sequence[14:])) + len(findall("T", sequence[14:]))
_score += terminal_at_count
# Criteria 3: Contains stretch of 4 or more bases, such as AAAA or CCCC -> -1
if len(findall("AAAA", sequence)) > 0 or len(findall("TTTT", sequence)) > 0 or len(findall("GGGG", sequence)) > 0\
or len(findall("CCCC", sequence)) > 0:
_score -= 1
# Criteria 4: A at position 19 (sense) -> 1 point
if sequence[18] == "A":
_score += 1
# Criteria 5: A at position 3 (sense) -> 1 point
if sequence[2] == "A":
_score += 1
# Criteria 6: T at position 10 (sense) -> 1 point
if sequence[9] == "T":
_score += 1
# Criteria 7: G/C at position 19 (sense) -> -1 point
if len(findall("[GC]", sequence[18])) > 0:
_score -= 1
# Criteria 8: G at position 13 (sense) -> -1 point
if sequence[12] == "G":
_score -= 1
return _score
if isfile(in_args.sequence):
with open(in_args.sequence, "r") as ifile:
full_seq = ifile.read()
full_seq = sub(">.*", "", full_seq).upper()
full_seq = sub("\s[0-9]", "", full_seq)
full_seq = sub("U", "T", full_seq)
full_seq = sub("[^ATCG]", "X", full_seq)
else:
full_seq = in_args.sequence.upper()
full_seq = sub("U", "T", full_seq)
full_seq = sub("[^ATCG]", "X", full_seq)
si_seqs_list = [[], [], [], [], [], [], [], [], [], []]
for i in range(len(full_seq) - 18):
seq = full_seq[i:i + 19]
score = si_score(seq)
score = score if score >= 0 else 0
si_seqs_list[score].append((seq, i + 1))
biggest_column = 0
for si_seq in si_seqs_list:
if len(si_seq) > biggest_column:
biggest_column = len(si_seq)
# super clunky text formating... But looks good in terminal
output = "Score:\t\t9\t\t\t\t8\t\t\t\t7\t\t\t\t6\t\t\t\t5\t\t\t\t4\t\t\t\t3\t\t\t\t2\t\t\t\t1\t\t\t\t0\n"
for row in range(biggest_column):
for index in [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]:
if row < len(si_seqs_list[index]):
output += "%s— %s\t" % (str(si_seqs_list[index][row][1]).ljust(5), si_seqs_list[index][row][0])
else:
output += "\t\t\t\t"
output += "\n"
if in_args.csv:
output = sub("\t{4}", ",,", output)
output = sub("\t{2}", ",", output)
output = sub("\t", ",", output)
output = sub("—", ",", output)
output = sub(" ", "", output)
print(output)