-
Notifications
You must be signed in to change notification settings - Fork 0
/
core_SNPs3.pl
executable file
·157 lines (145 loc) · 5.37 KB
/
core_SNPs3.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/perl
#v3.1
# core_SNPs.pl SNPs_all_labelLoci fileName2genomeName $min_fraction_with_locus
# creates core_SNPs and nonCore_SNPs and SNPs_in_majority.#
# core_SNPs has the SNP loci that are present in all the input genomes
# nonCore_SNPs has the rest of the loci, that are present in a subset of genomes
# SNPs_in_majority.$min_fraction_with_locus has the SNPs present in at least this fraction of genomes: $min_fraction_with_locus
no warnings 'deprecated';
# Count # "ids" in IN - JN
my $snp_file=$ARGV[0];
my $idfile=$ARGV[1];
my $min_fraction_with_locus=$ARGV[2];
open IN, "$idfile";
my $num_ids=0;
while (my $line = <IN>) {
chomp($line);
if ($line =~ /\w+/) {
$num_ids++;
}
}
close IN;
#print "$num_ids\n";
open IN, "$snp_file" or die "Cannot open $snp_file: $!.\n";
my $count=0;
my $locus_count=0;
my %by_id=();
my %snps=();
my $count_core=0;
my $count_noncore=0;
my $count_majority=0;
my $out="SNPs_in_majority".$min_fraction_with_locus;
open CORE,">core_SNPs" or warn "Cannot open core_SNPs for writing: $!\n";
open NONCORE,">nonCore_SNPs" or warn "Cannot open nonCore_SNPs for writing: $!\n";
open MAJORITY,">$out" or warn "Cannot open $out for writing: $!\n";
while (my $line = <IN>) {
chomp($line);
# print "$line\n";
if ($line !~ /\w+/) {
$locus_count++;
$count=0; # Count of what?
if ($locus_count >1) { # Do this only _after_ the locus is read. (what happens to last locus data?) - JN
# print SNP into either core or nonCore file
#
# This tests to see if we have found the last ID / locus (not sure why > portion of test) - JN
my $num_ids_with_locus=scalar keys %snps;
# If this matches, we output "core" and "majority", but only count the "core" part? - JN
if ($num_ids_with_locus >= $num_ids) {
$count_core++;
print CORE "\n";
print MAJORITY "\n";
# Go through snps in sorted order (maybe? What's this $a cmp $b business?) - JN
foreach my $id (sort {$a cmp $b} keys %snps) {
# More $a ... $b stuff? This time going through IDs associated with each snp in
# sorted order, to ... - JN
foreach my $c (sort {$a <=> $b} keys %{$snps{$id}}) {
# Output the item in core + majority into corresponding output files. - JN
print CORE "$snps{$id}{$c}\n";
print MAJORITY "$snps{$id}{$c}\n";
}
}
# This test tells us the item is non-core, and also majority.
} elsif ($num_ids_with_locus >= $num_ids*$min_fraction_with_locus && $num_ids_with_locus < $num_ids) {
$count_noncore++;
$count_majority++;
print MAJORITY "\n";
print NONCORE "\n";
foreach my $id (sort {$a cmp $b} keys %snps) {
foreach my $c (sort {$a <=> $b} keys %{$snps{$id}}) {
print MAJORITY "$snps{$id}{$c}\n";
print NONCORE "$snps{$id}{$c}\n";
}
}
# If neither above test succeeds, this (locus?) is "noncore", and
# the data is only put in the noncore output file - JN
} else {
$count_noncore++;
print NONCORE "\n";
foreach my $id (sort {$a cmp $b} keys %snps) {
foreach my $c (sort {$a <=> $b} keys %{$snps{$id}}) {
print NONCORE "$snps{$id}{$c}\n";
}
}
}
}
# reinitialize %snp after printing
# Reset snp the first time too (but don't do the rest). Note: snps
# is initialized, so this isn't needed for first run either. - JN
%snps=();
# Test to see if line matches <numbers> <blah> <blah> <digits+else?> <nonwhitespace>
# instead of \w+ ... if so, increment count, and ... - JN
} elsif ($line =~ /\d+\t.*\t.*\t(\d+(?:\sF|\sR)?|x)\t(\S+)/) {
my $id=$2;
$count++;
# This pattern appears in at least one other file; code that can
# be generalized out? - JN
# This records the data in $line for every ID and every count...
# A little memory intensive? - JN
$snps{$id}{$count}=$line; # keep by count in case locus occurs multiple positions in a genome
}
}
# Here we repeat the tests above .... Pain to keep in sync. Can we find
# another approach? Looks like it works the same today.- JN
# Add last one
my $num_ids_with_locus=scalar keys %snps;
if ($num_ids_with_locus >= $num_ids) {
$count_core++;
print CORE "\n";
print MAJORITY "\n";
foreach my $id (sort {$a cmp $b} keys %snps) {
foreach my $c (sort {$a <=> $b} keys %{$snps{$id}}) {
print CORE "$snps{$id}{$c}\n";
print MAJORITY "$snps{$id}{$c}\n";
}
}
} elsif ($num_ids_with_locus >= $num_ids*$min_fraction_with_locus && $num_ids_with_locus < $num_ids) {
$count_noncore++;
$count_majority++;
print MAJORITY "\n";
print NONCORE "\n";
foreach my $id (sort {$a cmp $b} keys %snps) {
foreach my $c (sort {$a <=> $b} keys %{$snps{$id}}) {
print MAJORITY "$snps{$id}{$c}\n";
print NONCORE "$snps{$id}{$c}\n";
}
}
} else {
$count_noncore++;
print NONCORE "\n";
foreach my $id (sort {$a cmp $b} keys %snps) {
foreach my $c (sort {$a <=> $b} keys %{$snps{$id}}) {
print NONCORE "$snps{$id}{$c}\n";
}
}
}
# This is where we fix the fact that we weren't incrementing 'majority' in the
# first of each test above. - JN
$count_majority += $count_core;
close IN;
close CORE;
close NONCORE;
open COUNT,">COUNT_coreSNPs" or warn "Cannot open COUNT_coreSNPs: $!.\n";
print COUNT "Number core SNPs: $count_core\n";
print COUNT "Number non-core SNPs: $count_noncore\n";
print COUNT "Number SNPs in at least a fraction $min_fraction_with_locus of genomes: $count_majority\n";
close COUNT or warn "Cannot close COUNT_coreSNPs: $!.\n";