diff --git a/mzLib/Test/TestSpectra.cs b/mzLib/Test/TestSpectra.cs index 8a8b0eb1..450c5038 100644 --- a/mzLib/Test/TestSpectra.cs +++ b/mzLib/Test/TestSpectra.cs @@ -19,9 +19,13 @@ using MassSpectrometry; using MzLibUtil; using NUnit.Framework; +using Proteomics.PSM; +using Readers; using System; using System.Collections.Generic; +using System.IO; using System.Linq; +using System.Text.RegularExpressions; using Stopwatch = System.Diagnostics.Stopwatch; namespace Test @@ -55,6 +59,819 @@ public void Setup() _mzSpectrumA = new MzSpectrum(mz, intensities, false); } + + + [Test] + public static void MoreJunk() + { + List quantResults = File.ReadAllLines(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\AllQuantifiedPeptidesMarkovich.tsv").ToList(); + List fullSequences = new List(); + foreach (var line in quantResults) + { + string[] fields = line.Split('\t'); + fullSequences.Add(fields[0]); + } + + string psmFilePath = + @"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\AllPeptidesMarkovich.psmtsv"; + List parsedPsms = SpectrumMatchTsvReader.ReadPsmTsv(psmFilePath, out var warnings); + + List quantPeptidesFoundInAllPeptides = new List(); + foreach (var psm in parsedPsms) + { + bool containsAny = fullSequences.Any(s => psm.FullSequence.Contains(s)); + if (containsAny) + { + fullSequences.Remove(psm.FullSequence); + quantPeptidesFoundInAllPeptides.Add(psm); + } + + if (fullSequences.Count == 0) + { + break; + } + } + + List myOut = new List(); + //myOut.Add(parsedPsms[0].ToString()); + foreach (var psm in quantPeptidesFoundInAllPeptides) + { + + myOut.Add(psm.ToString()); + + } + + File.WriteAllLines(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\psmsFoundInQuantMarkovich.txt", myOut); + + } + [Test] + public static void Junk3() + { + + string psmFilePath = + @"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\AllPeptidesMarkovich.psmtsv"; + List parsedPsms = SpectrumMatchTsvReader.ReadPsmTsv(psmFilePath, out var warnings); + parsedPsms = parsedPsms.Where(p => p.QValue < .01).ToList(); + parsedPsms = parsedPsms.Where(p => p.DecoyContamTarget.Contains("T")).ToList(); + + List interestingMods = new List { "Y[Common Biological:Phosphorylation on Y]", "S[Common Biological:Phosphorylation on S]", "T[Common Biological:Phosphorylation on T]" }; + + Dictionary<(string, int), List> genePositionPsm = new Dictionary<(string, int), List>(); + + foreach (var psm in parsedPsms) + { + List foundMods = interestingMods.Where(s => psm.FullSequence.Contains(s)).ToList(); + if (foundMods.Any()) + { + string sequence = psm.FullSequence; + foreach (var mod in foundMods) + { + string firstCharacter = mod.Substring(0, 1); + if (firstCharacter == "Y" || firstCharacter == "S" || firstCharacter == "T") + { + sequence = sequence.Replace(mod, firstCharacter.ToLowerInvariant()); + } + } + sequence = Regex.Replace(sequence, "\\[(.*?)\\]", ""); + List lowercaseLetterPositions = new List(); + + for (int i = 0; i < sequence.Length; i++) + { + if (char.IsLower(sequence[i])) + { + lowercaseLetterPositions.Add(i); + } + } + string firstAndLastAminoAcidPositionInProtein = psm.StartAndEndResiduesInProtein.Split('|')[0]; + firstAndLastAminoAcidPositionInProtein = firstAndLastAminoAcidPositionInProtein.Substring(1, firstAndLastAminoAcidPositionInProtein.Length - 2); + firstAndLastAminoAcidPositionInProtein = firstAndLastAminoAcidPositionInProtein.Replace(" to ", "\t"); + int[] startEnd = firstAndLastAminoAcidPositionInProtein.Split('\t').Select(int.Parse).ToArray(); + lowercaseLetterPositions = lowercaseLetterPositions.Select(s => s + startEnd[0]).ToList(); + + string allGenesInPsm = psm.GeneName; + string[] genes = allGenesInPsm.Split('|'); + string firstGene = genes[0].Split(':')[1]; + + foreach (int position in lowercaseLetterPositions) + { + if (genePositionPsm.ContainsKey((firstGene, position))) + { + genePositionPsm[(firstGene, position)].Add(psm); + + } + else + { + genePositionPsm.Add((firstGene, position), new List { psm }); + } + } + } + } + + List myOut = new List(); + myOut.Add("position" + "\t" + "protein accession" + "\t" + "gene" + "\t" + "full sequence"); + + foreach (var kvp in genePositionPsm) + { + if (kvp.Value.Count > 1) + { + foreach (var psm in kvp.Value) + { + myOut.Add(kvp.Key.Item2 + "\t" + psm.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm.FullSequence); + } + } + } + + File.WriteAllLines(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\phosphoPeptidesMarkovich.txt", myOut); + + } + + [Test] + public static void Junk4() + { + Dictionary<(string,string),string> baseSequencefullSequencQvaluePepQvalueforPSMs = new Dictionary<(string, string), string>(); + + using (StreamReader sr = new StreamReader(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\AllPSMsMarkovich.psmtsv")) + { + bool continueReading = true; + string line; + while ((line = sr.ReadLine()) != null && continueReading) + { + string[] fields = line.Split('\t'); + string baseSequence = fields[12]; + string fullSequence = fields[13]; + string targetDecoyContam = fields[38]; + string qValue = fields[50]; + string pepQValue = fields[55]; + if (targetDecoyContam == "T" && double.TryParse(qValue, out double qValueDouble)) + { + if (qValueDouble < 0.01 &&!baseSequencefullSequencQvaluePepQvalueforPSMs.ContainsKey((baseSequence,fullSequence))) + { + baseSequencefullSequencQvaluePepQvalueforPSMs.Add((baseSequence, fullSequence), qValue + "\t" + pepQValue); + } + + } + + if (double.TryParse(qValue, out double qValueDouble2)) + { + if (qValueDouble2 > 0.01) + { + continueReading = false; + } + } + } + } + + string psmFilePath = + @"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\AllPeptidesMarkovich.psmtsv"; + List parsedPeptides = SpectrumMatchTsvReader.ReadPsmTsv(psmFilePath, out var warnings); + parsedPeptides = parsedPeptides.Where(p => p.DecoyContamTarget.Contains("T")).ToList(); + + List interestingMods = new List { "S[Common Biological:Phosphorylation on S]", "T[Common Biological:Phosphorylation on T]", "S[Less Common:Dehydroalanine on S]", "C[Less Common:Dehydroalanine on C]", + "S[Custom:Homocys on S]", "C[Custom:Homocys on C]", "T[Custom:Homocys on T]", "T[Less Common:Dehydrobutyrine on T]", "S[Custom:DTT on S]", "C[Custom:DTT on C]", "C[Custom:DTT on T]", "T[Custom:Glutathione on T]", + "S[Custom:Glutathione on S]", "C[Custom:Glutathione on C]", "S[Custom:TCEP on S]", "T[Custom:TCEP on T]", "C[Custom:TCEP on C]" }; + + Dictionary<(string, int), List> genePositionPeptides = new Dictionary<(string, int), List>(); + Dictionary<(string, int), List> genePositionMod = new Dictionary<(string, int), List>(); + Dictionary> noInterestingModsPeptideValues = new Dictionary>(); + + foreach (var peptide in parsedPeptides) + { + if (baseSequencefullSequencQvaluePepQvalueforPSMs.ContainsKey((peptide.BaseSeq,peptide.FullSequence))) + { + List foundMods = interestingMods.Where(s => peptide.FullSequence.Contains(s)).ToList(); + if (foundMods.Any()) + { + foreach (var mod in foundMods) + { + string sequence = peptide.FullSequence; + string firstCharacter = mod.Substring(0, 1); + if (firstCharacter == "Y" || firstCharacter == "S" || firstCharacter == "T" || + firstCharacter == "C") + { + sequence = sequence.Replace(mod, firstCharacter.ToLowerInvariant()); + } + + //eliminate the remaning mods + while (sequence.Contains("[") && sequence.Contains("]")) + { + int firstOpenBracket = sequence.IndexOf('['); + int firstCloseBracket = sequence.IndexOf(']', firstOpenBracket); + if (firstCloseBracket != -1) + { + sequence = sequence.Remove(firstOpenBracket, firstCloseBracket - firstOpenBracket + 1); + } + else + { + break; + } + } + + List lowercaseLetterPositions = new List(); + + for (int i = 0; i < sequence.Length; i++) + { + if (char.IsLower(sequence[i])) + { + lowercaseLetterPositions.Add(i); + } + } + + string firstAndLastAminoAcidPositionInProtein = + peptide.StartAndEndResiduesInProtein.Split('|')[0]; + firstAndLastAminoAcidPositionInProtein = + firstAndLastAminoAcidPositionInProtein.Substring(1, + firstAndLastAminoAcidPositionInProtein.Length - 2); + firstAndLastAminoAcidPositionInProtein = + firstAndLastAminoAcidPositionInProtein.Replace(" to ", "\t"); + int[] startEnd = firstAndLastAminoAcidPositionInProtein.Split('\t').Select(int.Parse) + .ToArray(); + lowercaseLetterPositions = lowercaseLetterPositions.Select(s => s + startEnd[0]).ToList(); + + string allGenesInPsm = peptide.GeneName; + string firstGene = ""; + if (allGenesInPsm.Contains("|")) + { + string[] genes = allGenesInPsm.Split('|'); + + if (genes[0].Contains(":")) + { + firstGene = genes[0].Split(':')[1]; + + foreach (int position in lowercaseLetterPositions) + { + if (genePositionPeptides.ContainsKey((firstGene, position))) + { + genePositionPeptides[(firstGene, position)].Add(peptide); + genePositionMod[(firstGene, position)] + .Add(mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue); + + } + else + { + genePositionPeptides.Add((firstGene, position), new List { peptide }); + genePositionMod.Add((firstGene, position), + new List { mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue }); + } + } + } + else + { + firstGene = genes[0]; + if (genes[0].Contains(":")) + { + firstGene = genes[0].Split(':')[1]; + } + foreach (int position in lowercaseLetterPositions) + { + if (genePositionPeptides.ContainsKey((firstGene, position))) + { + genePositionPeptides[(firstGene, position)].Add(peptide); + genePositionMod[(firstGene, position)] + .Add(mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue); + + } + else + { + genePositionPeptides.Add((firstGene, position), new List { peptide }); + genePositionMod.Add((firstGene, position), + new List { mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue }); + } + } + } + } + else + { + firstGene = peptide.GeneName; + if (peptide.GeneName.Contains(":")) + { + firstGene = peptide.GeneName.Split(':')[1]; + } + + foreach (int position in lowercaseLetterPositions) + { + if (genePositionPeptides.ContainsKey((firstGene, position))) + { + genePositionPeptides[(firstGene, position)].Add(peptide); + genePositionMod[(firstGene, position)] + .Add(mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue); + + } + else + { + genePositionPeptides.Add((firstGene, position), new List { peptide }); + genePositionMod.Add((firstGene, position), + new List { mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue }); + } + } + } + } + } + else + { + if (noInterestingModsPeptideValues.ContainsKey(peptide.BaseSeq)) + { + noInterestingModsPeptideValues[peptide.BaseSeq].Add(peptide); + } + else + { + noInterestingModsPeptideValues.Add(peptide.BaseSeq, new List { peptide }); + } + + } + } + } + + List myOut = new List(); + myOut.Add("position" + "\t" + "protein accession" + "\t" + "gene" + "\t" + "base sequence" + "\t" + "full sequence" + "\t" + "modification" + "\t" + "Peptide Q-value" + "\t" + "Peptide PEP Q-Value" + "\t" + "PSM Q-value" + "\t" + "PSM PEP Q-Value"); + + foreach (var kvp in genePositionPeptides) + { + if (kvp.Value.Count > 1) + { + foreach (var psm in kvp.Value) + { + int index = kvp.Value.IndexOf(psm); + if(baseSequencefullSequencQvaluePepQvalueforPSMs.ContainsKey((psm.BaseSeq, psm.FullSequence))) + { + myOut.Add(kvp.Key.Item2 + "\t" + psm.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm.BaseSeq + "\t" + psm.FullSequence + "\t" + genePositionMod[kvp.Key][index] + "\t" + baseSequencefullSequencQvaluePepQvalueforPSMs[(psm.BaseSeq, psm.FullSequence)]); + if (noInterestingModsPeptideValues.ContainsKey(psm.BaseSeq)) + { + foreach (var psm2 in noInterestingModsPeptideValues[psm.BaseSeq]) + { + myOut.Add(kvp.Key.Item2 + "\t" + psm2.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm2.BaseSeq + "\t" + psm2.FullSequence + "\t" + "No Modifications of Interest" + "\t" + psm2.QValue + "\t" + psm2.PEP_QValue + "\t" + baseSequencefullSequencQvaluePepQvalueforPSMs[(psm2.BaseSeq, psm2.FullSequence)]); + } + noInterestingModsPeptideValues.Remove(psm.BaseSeq); + } + } + else + { + myOut.Add(kvp.Key.Item2 + "\t" + psm.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm.BaseSeq + "\t" + psm.FullSequence + "\t" + genePositionMod[kvp.Key][index]); + if (noInterestingModsPeptideValues.ContainsKey(psm.BaseSeq)) + { + foreach (var psm2 in noInterestingModsPeptideValues[psm.BaseSeq]) + { + string psmQvalues = baseSequencefullSequencQvaluePepQvalueforPSMs[(psm2.BaseSeq, psm2.FullSequence)]; + myOut.Add(kvp.Key.Item2 + "\t" + psm2.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm2.BaseSeq + "\t" + psm2.FullSequence + "\t" + "No Modifications of Interest" + "\t" + psm2.QValue + "\t" + psm2.PEP_QValue + "\t" + psmQvalues); + + } + noInterestingModsPeptideValues.Remove(psm.BaseSeq); + } + } + + } + } + } + + File.WriteAllLines(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\modPeptidesMarkovichWithPosition.txt", myOut); + + } + + + + [Test] + public static void Junk5() + { + List interestingMods = new List { "S[Common Biological:Phosphorylation on S]", "T[Common Biological:Phosphorylation on T]", "S[Less Common:Dehydroalanine on S]", "C[Less Common:Dehydroalanine on C]", + "S[Custom:Homocys on S]", "C[Custom:Homocys on C]", "T[Custom:Homocys on T]", "T[Less Common:Dehydrobutyrine on T]", "S[Custom:DTT on S]", "C[Custom:DTT on C]", "C[Custom:DTT on T]", "T[Custom:Glutathione on T]", + "S[Custom:Glutathione on S]", "C[Custom:Glutathione on C]", "S[Custom:TCEP on S]", "T[Custom:TCEP on T]", "C[Custom:TCEP on C]" }; + + //string is position accession gene + //dictionary key is mod and int is count + Dictionary> bubba = new Dictionary>(); + using (StreamReader sr = + new StreamReader( + @"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\modPeptidesMarkovichWithPosition.txt")) + { + bool continueReading = true; + string line; + + while ((line = sr.ReadLine()) != null && continueReading) + { + + string[] fields = line.Split('\t'); + string myKey = fields[0] + "\t" + fields[1] + "\t" + fields[2]; + + bool goodLine = (double.TryParse(fields[5], out double qvalue) && qvalue < 0.01 ); + + if (goodLine) + { + if (bubba.ContainsKey(myKey)) + { + if (bubba[myKey].ContainsKey(fields[4])) + { + bubba[myKey][fields[4]]++; + } + else + { + bubba[myKey].Add(fields[4], 1); + } + } + else + { + bubba.Add(myKey, new Dictionary { { fields[4], 1 } }); + } + } + } + } + + List myOut = new List(); + myOut.Add("position" + "\t" + "protein accession" + "\t" + "gene" + "\t" + String.Join('\t',interestingMods)); + + foreach (var kvp in bubba) + { + string myLine = kvp.Key +"\t"; + foreach (var mod in interestingMods) + { + if (kvp.Value.ContainsKey(mod)) + { + myLine += kvp.Value[mod] + "\t"; + } + else + { + myLine += "0\t"; + } + } + myOut.Add(myLine); + } + + File.WriteAllLines(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\tableAllPeptidesMarkovich.txt", myOut); + + } + + [Test] + public static void Junk6() + { + Dictionary<(int,string,string),List> modPeptidesWithPositionLines = new Dictionary<(int, string, string), List>(); + Dictionary<(int, string, string), List> modPeptidesWithPositionFullSequences = new Dictionary<(int, string, string), List>(); + Dictionary<(int, string, string), List> modPeptidesWithPositionBaseSequences = new Dictionary<(int, string, string), List>(); + Dictionary> basePeptideToFullSequences = new Dictionary>(); + + using (StreamReader sr = + new StreamReader( + @"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\modPeptides180WithPosition.txt")) + { + bool continueReading = true; + string line; + bool firstLine = true; + + while ((line = sr.ReadLine()) != null && continueReading) + { + if (!firstLine) + { + string[] fields = line.Split('\t'); + string myKey = fields[0] + "\t" + fields[1] + "\t" + fields[2]; + + string baseSequence = fields[3]; + while (baseSequence.Contains("[") && baseSequence.Contains("]")) + { + int firstOpenBracket = baseSequence.IndexOf('['); + int firstCloseBracket = baseSequence.IndexOf(']', firstOpenBracket); + if (firstCloseBracket != -1) + { + baseSequence = baseSequence.Remove(firstOpenBracket, firstCloseBracket - firstOpenBracket + 1); + } + else + { + break; + } + } + + + + + if (modPeptidesWithPositionLines.ContainsKey((int.Parse(fields[0]), fields[1], fields[2]))) + { + modPeptidesWithPositionLines[(int.Parse(fields[0]), fields[1], fields[2])].Add(line); + } + else + { + modPeptidesWithPositionLines.Add((int.Parse(fields[0]), fields[1], fields[2]), new List { line }); + } + + if (modPeptidesWithPositionFullSequences.ContainsKey((int.Parse(fields[0]), fields[1], + fields[2]))) + { + modPeptidesWithPositionFullSequences[(int.Parse(fields[0]), fields[1], fields[2])].Add(fields[3]); + } + else + { + modPeptidesWithPositionFullSequences.Add((int.Parse(fields[0]), fields[1], fields[2]), new List { fields[3] }); + } + + if (modPeptidesWithPositionBaseSequences.ContainsKey((int.Parse(fields[0]), fields[1], + fields[2]))) + { + modPeptidesWithPositionBaseSequences[(int.Parse(fields[0]), fields[1], fields[2])].Add(baseSequence); + } + else + { + modPeptidesWithPositionBaseSequences.Add((int.Parse(fields[0]), fields[1], fields[2]), new List { baseSequence }); + } + + if (basePeptideToFullSequences.ContainsKey(baseSequence)) + { + basePeptideToFullSequences[baseSequence].Add(fields[3]); + } + else + { + basePeptideToFullSequences.Add(baseSequence, new List { fields[3] }); + } + } + firstLine = false; + } + } + + foreach (var kvp in modPeptidesWithPositionBaseSequences) + { + modPeptidesWithPositionBaseSequences[kvp.Key] = kvp.Value.Distinct().ToList(); + } + + foreach (var kvp in basePeptideToFullSequences) + { + basePeptideToFullSequences[kvp.Key] = kvp.Value.Distinct().ToList(); + } + } + + [Test] + public static void Junk7() + { + Dictionary<(string, string), string> baseSequencefullSequencQvaluePepQvalueforPSMs = new Dictionary<(string, string), string>(); + + using (StreamReader sr = new StreamReader(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\AllPSMsMarkovich.psmtsv")) + { + bool continueReading = true; + string line; + while ((line = sr.ReadLine()) != null && continueReading) + { + string[] fields = line.Split('\t'); + string baseSequence = fields[12]; + string fullSequence = fields[13]; + string targetDecoyContam = fields[38]; + string qValue = fields[50]; + string pepQValue = fields[55]; + if (targetDecoyContam == "T" && double.TryParse(qValue, out double qValueDouble)) + { + if (qValueDouble < 0.01 && !baseSequencefullSequencQvaluePepQvalueforPSMs.ContainsKey((baseSequence, fullSequence))) + { + baseSequencefullSequencQvaluePepQvalueforPSMs.Add((baseSequence, fullSequence), qValue + "\t" + pepQValue); + } + + } + + if (double.TryParse(qValue, out double qValueDouble2)) + { + if (qValueDouble2 > 0.01) + { + continueReading = false; + } + } + } + } + + string psmFilePath = + @"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\AllPeptidesMarkovich.psmtsv"; + List parsedPeptides = SpectrumMatchTsvReader.ReadPsmTsv(psmFilePath, out var warnings); + parsedPeptides = parsedPeptides.Where(p => p.DecoyContamTarget.Contains("T")).ToList(); + + List interestingMods = new List { "S[Common Biological:Phosphorylation on S]", "T[Common Biological:Phosphorylation on T]", "S[Less Common:Dehydroalanine on S]", "C[Less Common:Dehydroalanine on C]", + "S[Custom:Homocys on S]", "C[Custom:Homocys on C]", "T[Custom:Homocys on T]", "T[Less Common:Dehydrobutyrine on T]", "S[Custom:DTT on S]", "C[Custom:DTT on C]", "C[Custom:DTT on T]", "T[Custom:Glutathione on T]", + "S[Custom:Glutathione on S]", "C[Custom:Glutathione on C]", "S[Custom:TCEP on S]", "T[Custom:TCEP on T]", "C[Custom:TCEP on C]" }; + + Dictionary<(string, int), List> genePositionPeptides = new Dictionary<(string, int), List>(); + Dictionary<(string, int), List> genePositionMod = new Dictionary<(string, int), List>(); + Dictionary> noInterestingModsPeptideValues = new Dictionary>(); + + foreach (var peptide in parsedPeptides) + { + if (baseSequencefullSequencQvaluePepQvalueforPSMs.ContainsKey((peptide.BaseSeq, peptide.FullSequence))) + { + List foundMods = interestingMods.Where(s => peptide.FullSequence.Contains(s)).ToList(); + if (foundMods.Any()) + { + foreach (var mod in foundMods) + { + string sequence = peptide.FullSequence; + string firstCharacter = mod.Substring(0, 1); + if (firstCharacter == "Y" || firstCharacter == "S" || firstCharacter == "T" || + firstCharacter == "C") + { + sequence = sequence.Replace(mod, firstCharacter.ToLowerInvariant()); + } + + sequence = sequence.Replace("[I]", ""); + sequence = sequence.Replace("[II]", ""); + sequence = sequence.Replace("[III]", ""); + + //eliminate the remaning mods + while (sequence.Contains("[") && sequence.Contains("]")) + { + int firstOpenBracket = sequence.IndexOf('['); + int firstCloseBracket = sequence.IndexOf(']', firstOpenBracket); + if (firstCloseBracket != -1) + { + sequence = sequence.Remove(firstOpenBracket, firstCloseBracket - firstOpenBracket + 1); + } + else + { + break; + } + } + + List lowercaseLetterPositions = new List(); + + for (int i = 0; i < sequence.Length; i++) + { + if (char.IsLower(sequence[i])) + { + lowercaseLetterPositions.Add(i); + } + } + + string firstAndLastAminoAcidPositionInProtein = + peptide.StartAndEndResiduesInProtein.Split('|')[0]; + firstAndLastAminoAcidPositionInProtein = + firstAndLastAminoAcidPositionInProtein.Substring(1, + firstAndLastAminoAcidPositionInProtein.Length - 2); + firstAndLastAminoAcidPositionInProtein = + firstAndLastAminoAcidPositionInProtein.Replace(" to ", "\t"); + int[] startEnd = firstAndLastAminoAcidPositionInProtein.Split('\t').Select(int.Parse) + .ToArray(); + lowercaseLetterPositions = lowercaseLetterPositions.Select(s => s + startEnd[0]).ToList(); + + string allGenesInPsm = peptide.GeneName; + string firstGene = ""; + if (allGenesInPsm.Contains("|")) + { + string[] genes = allGenesInPsm.Split('|'); + + if (genes[0].Contains(":")) + { + firstGene = genes[0].Split(':')[1]; + + foreach (int position in lowercaseLetterPositions) + { + if (genePositionPeptides.ContainsKey((firstGene, position))) + { + genePositionPeptides[(firstGene, position)].Add(peptide); + genePositionMod[(firstGene, position)] + .Add(mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue); + + } + else + { + genePositionPeptides.Add((firstGene, position), new List { peptide }); + genePositionMod.Add((firstGene, position), + new List { mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue }); + } + } + } + else + { + firstGene = genes[0]; + if (genes[0].Contains(":")) + { + firstGene = genes[0].Split(':')[1]; + } + foreach (int position in lowercaseLetterPositions) + { + if (genePositionPeptides.ContainsKey((firstGene, position))) + { + genePositionPeptides[(firstGene, position)].Add(peptide); + genePositionMod[(firstGene, position)] + .Add(mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue); + + } + else + { + genePositionPeptides.Add((firstGene, position), new List { peptide }); + genePositionMod.Add((firstGene, position), + new List { mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue }); + } + } + } + } + else + { + firstGene = peptide.GeneName; + if (peptide.GeneName.Contains(":")) + { + firstGene = peptide.GeneName.Split(':')[1]; + } + + foreach (int position in lowercaseLetterPositions) + { + if (genePositionPeptides.ContainsKey((firstGene, position))) + { + genePositionPeptides[(firstGene, position)].Add(peptide); + genePositionMod[(firstGene, position)] + .Add(mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue); + + } + else + { + genePositionPeptides.Add((firstGene, position), new List { peptide }); + genePositionMod.Add((firstGene, position), + new List { mod + "\t" + peptide.QValue + "\t" + peptide.PEP_QValue }); + } + } + } + } + } + else + { + if (noInterestingModsPeptideValues.ContainsKey(peptide.BaseSeq)) + { + noInterestingModsPeptideValues[peptide.BaseSeq].Add(peptide); + } + else + { + noInterestingModsPeptideValues.Add(peptide.BaseSeq, new List { peptide }); + } + + } + } + } + + List<(string,string)> outList = new List<(string, string)>(); + List myOut = new List(); + myOut.Add("position" + "\t" + "protein accession" + "\t" + "gene" + "\t" + "base sequence" + "\t" + "full sequence" + "\t" + "modification" + "\t" + "Peptide Q-value" + "\t" + "Peptide PEP Q-Value" + "\t" + "PSM Q-value" + "\t" + "PSM PEP Q-Value"); + + foreach (var kvp in genePositionPeptides) + { + if (kvp.Value.Count > 1) + { + foreach (var psm in kvp.Value) + { + int index = kvp.Value.IndexOf(psm); + if (baseSequencefullSequencQvaluePepQvalueforPSMs.ContainsKey((psm.BaseSeq, psm.FullSequence))) + { + myOut.Add(kvp.Key.Item2 + "\t" + psm.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm.BaseSeq + "\t" + psm.FullSequence + "\t" + genePositionMod[kvp.Key][index] + "\t" + baseSequencefullSequencQvaluePepQvalueforPSMs[(psm.BaseSeq, psm.FullSequence)]); + outList.Add((psm.FullSequence, kvp.Key.Item2 + "\t" + psm.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm.BaseSeq + "\t" + psm.FullSequence + "\t" + genePositionMod[kvp.Key][index] + "\t" + baseSequencefullSequencQvaluePepQvalueforPSMs[(psm.BaseSeq, psm.FullSequence)])); + if (noInterestingModsPeptideValues.ContainsKey(psm.BaseSeq)) + { + foreach (var psm2 in noInterestingModsPeptideValues[psm.BaseSeq]) + { + myOut.Add(kvp.Key.Item2 + "\t" + psm2.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm2.BaseSeq + "\t" + psm2.FullSequence + "\t" + "No Modifications of Interest" + "\t" + psm2.QValue + "\t" + psm2.PEP_QValue + "\t" + baseSequencefullSequencQvaluePepQvalueforPSMs[(psm2.BaseSeq, psm2.FullSequence)]); + outList.Add((psm2.FullSequence, kvp.Key.Item2 + "\t" + psm2.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm2.BaseSeq + "\t" + psm2.FullSequence + "\t" + "No Modifications of Interest" + "\t" + psm2.QValue + "\t" + psm2.PEP_QValue + "\t" + baseSequencefullSequencQvaluePepQvalueforPSMs[(psm2.BaseSeq, psm2.FullSequence)])); + } + noInterestingModsPeptideValues.Remove(psm.BaseSeq); + } + } + else + { + //myOut.Add(kvp.Key.Item2 + "\t" + psm.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm.BaseSeq + "\t" + psm.FullSequence + "\t" + genePositionMod[kvp.Key][index]); + + //if (noInterestingModsPeptideValues.ContainsKey(psm.BaseSeq)) + //{ + // foreach (var psm2 in noInterestingModsPeptideValues[psm.BaseSeq]) + // { + // string psmQvalues = baseSequencefullSequencQvaluePepQvalueforPSMs[(psm2.BaseSeq, psm2.FullSequence)]; + // myOut.Add(kvp.Key.Item2 + "\t" + psm2.ProteinAccession + "\t" + kvp.Key.Item1 + "\t" + psm2.BaseSeq + "\t" + psm2.FullSequence + "\t" + "No Modifications of Interest" + "\t" + psm2.QValue + "\t" + psm2.PEP_QValue + "\t" + psmQvalues); + + // } + // noInterestingModsPeptideValues.Remove(psm.BaseSeq); + //} + } + + } + } + } + + Dictionary quantDict = new Dictionary(); + using (StreamReader sr = + new StreamReader( + @"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\QuantifiedPeptidesNormalizedMarkovich.tsv")) + { + string line; + while ((line = sr.ReadLine()) != null) + { + string[] fields = line.Split('\t'); + string fullSequence = fields[0]; + if (!quantDict.ContainsKey(fullSequence)) + { + quantDict.Add(fullSequence, line); + } + } + } + + List newOutList = new List(); + newOutList.Add(myOut[0] + "\t" + quantDict["Sequence"]); + foreach (var kvp in outList) + { + if (quantDict.ContainsKey(kvp.Item1)) + { + newOutList.Add(kvp.Item2 + "\t" + quantDict[kvp.Item1]); + } + else + { + newOutList.Add(kvp.Item2); + } + } + + File.WriteAllLines(@"C:\Users\Michael Shortreed\Downloads\4-26-24 AD Brain DHAA Analysis\modPeptidesMarkovichWithPositionAndQuant_new.txt", newOutList); + } + + [Test] public void SpectrumCount() {