diff --git a/src/TopDownProteomics/ProForma/TopPicProformaParser.cs b/src/TopDownProteomics/ProForma/TopPicProformaParser.cs
new file mode 100644
index 0000000..9877e34
--- /dev/null
+++ b/src/TopDownProteomics/ProForma/TopPicProformaParser.cs
@@ -0,0 +1,279 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Text.RegularExpressions;
+
+namespace TopDownProteomics.ProForma;
+
+/// <summary>
+/// A parser for TopPIC strings into a ProformaTerm
+/// </summary>
+public class TopPicProformaParser
+{
+    private readonly IDictionary<string, IList<ProFormaDescriptor>>? _modLookup;
+
+    #region Regex strings
+    // Matches "(RESIDUES)[mod]": group 1 is the parenthesised residue(s), group 2 captures each bracketed modification.
+    private static readonly Regex _modRx = new(@"\(([A-Z]{1,})\)(\[.+?\])+");
+
+    // Matches bracket content that is, in its entirety, a signed decimal mass shift (e.g. "+23.9987").
+    // Anchored, and with the decimal point escaped, so that modification names that merely contain
+    // digits (e.g. "iTRAQ4plex114") are not mistaken for masses.
+    private static readonly Regex _numberRx = new(@"^[+-]?[0-9]+\.[0-9]+$");
+
+    // Matches a '.' with a non-digit on at least one side (a terminal delimiter); the '.' inside a mass such as "23.9987" is skipped.
+    private static readonly Regex _terminalAaRx = new(@"\P{N}(\.)\P{N}??|\P{N}??(\.)\P{N}");
+
+    // Matches everything that is not a residue: bracketed modifications and the grouping parentheses.
+    private static readonly Regex _strippedSequenceRx = new(@"\[.+?\]|[()]");
+    #endregion
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicProformaParser"/> class.
+    /// </summary>
+    public TopPicProformaParser() { }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicProformaParser"/> class.
+    /// </summary>
+    /// <param name="modFile">The mod.txt file for mapping modifications.</param>
+    public TopPicProformaParser(string modFile)
+    {
+        _modLookup = ParseModFile(new FileInfo(modFile).OpenRead());
+    }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicProformaParser"/> class.
+    /// </summary>
+    /// <param name="modStream">The mod stream. It is read to the end and disposed by this constructor.</param>
+    public TopPicProformaParser(Stream modStream)
+    {
+        _modLookup = ParseModFile(modStream);
+    }
+
+    /// <summary>
+    /// Gets the proforma term.
+    /// </summary>
+    /// <param name="sequence">The sequence.</param>
+    /// <returns>The term built from the stripped sequence and the modifications found in it.</returns>
+    /// <exception cref="TopPicParserException">The string uses a construct this parser does not support.</exception>
+    public ProFormaTerm ParseTopPicString(string sequence)
+    {
+        //first remove terminal AA tags if there!
+        sequence = RemoveTerminalAAs(sequence);
+        var (nTerms, cTerms, tags) = FindPTMs(sequence);
+        return new ProFormaTerm(GetFullyStrippedSequence(sequence), tags, nTerms, cTerms);
+    }
+
+    /// <summary>
+    /// Reads a TopPIC modification file (one "Name,Mass,Residues,Position,UnimodID" entry per line) into a lookup keyed by name.
+    /// </summary>
+    /// <param name="modStream">The stream to read; it is disposed when parsing finishes.</param>
+    /// <returns>The descriptors to emit for each modification name.</returns>
+    /// <exception cref="TopPicParserException">An entry does not have five fields, or its UniMod id or mass is invalid.</exception>
+    private IDictionary<string, IList<ProFormaDescriptor>> ParseModFile(Stream modStream)
+    {
+        IDictionary<string, IList<ProFormaDescriptor>> modLookup = new Dictionary<string, IList<ProFormaDescriptor>>();
+
+        using StreamReader reader = new StreamReader(modStream);
+
+        string? line;
+        while ((line = reader.ReadLine()) is not null)
+        {
+            // Blank lines and '#' comments carry no data.
+            if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#", StringComparison.Ordinal))
+                continue;
+
+            //# To input a modification, use the following format:
+            //# Name,Mass,Residues,Position,UnimodID
+            var splitLine = line.Split(',');
+
+            if (splitLine.Length != 5)
+                throw new TopPicParserException("Failed to parse mod file");
+
+            var name = splitLine[0];
+
+            // The file uses '.' as decimal separator, so numbers are parsed and written with the invariant culture.
+            if (!int.TryParse(splitLine[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out var uniModNumber))
+                throw new TopPicParserException($"Failed to parse UniMod Id {splitLine[4]}".Trim());
+
+            if (uniModNumber > 0)
+            {
+                modLookup.Add(name, new List<ProFormaDescriptor>()
+                {
+                    new ProFormaDescriptor(ProFormaKey.Identifier, ProFormaEvidenceType.Unimod, $"UNIMOD:{uniModNumber}"),
+                    new ProFormaDescriptor(ProFormaKey.Info, name)
+                });
+            }
+            else if (uniModNumber == -1 && double.TryParse(splitLine[1], NumberStyles.Float, CultureInfo.InvariantCulture, out var mass))
+            {
+                // An id of -1 marks a modification without a UniMod entry; it is described by its mass instead.
+                modLookup.Add(name, new List<ProFormaDescriptor>()
+                {
+                    new ProFormaDescriptor(ProFormaKey.Mass, mass.ToString("+#.000000;-#.000000", CultureInfo.InvariantCulture)),
+                    new ProFormaDescriptor(ProFormaKey.Info, name)
+                });
+            }
+            else
+            {
+                throw new TopPicParserException("invalid UniMod Id or mass");
+            }
+        }
+
+        return modLookup;
+    }
+
+    /// <summary>
+    /// Removes bracketed modifications and parentheses, leaving only the residues.
+    /// </summary>
+    private string GetFullyStrippedSequence(string sequence) => _strippedSequenceRx.Replace(sequence, "");
+
+    /// <summary>
+    /// Maps the position of each residue character in the TopPIC string to its zero-based residue index.
+    /// Upper-case characters inside square brackets belong to modification names and are skipped.
+    /// </summary>
+    private static Dictionary<int, int> GetIndexLookup(string sequence)
+    {
+        Dictionary<int, int> indexLookup = new Dictionary<int, int>();
+
+        bool inBracket = false;
+        int index = 0;
+        for (int i = 0; i < sequence.Length; i++)
+        {
+            char c = sequence[i];
+            if (c == '[')
+                inBracket = true;
+            else if (c == ']')
+                inBracket = false;
+            else if (char.IsUpper(c) && !inBracket)
+                indexLookup[i] = index++;
+        }
+
+        return indexLookup;
+    }
+
+    /// <summary>
+    /// Finds every "(RESIDUES)[mod]" group and sorts it into N-terminal descriptors, C-terminal descriptors or tags.
+    /// </summary>
+    /// <param name="sequence">The TopPIC string without flanking residues.</param>
+    /// <returns>The N-terminal descriptors, the C-terminal descriptors and the residue tags, in that order.</returns>
+    /// <exception cref="TopPicParserException">A residue group carries more than one modification.</exception>
+    private Tuple<IList<ProFormaDescriptor>, IList<ProFormaDescriptor>, IList<ProFormaTag>> FindPTMs(string sequence)
+    {
+        var indexLookup = GetIndexLookup(sequence);
+
+        // Residue indices run from 0 to Count - 1, so the last one is known without scanning the lookup for every match.
+        int lastIndex = indexLookup.Count - 1;
+
+        List<ProFormaDescriptor> nTerms = new List<ProFormaDescriptor>();
+        List<ProFormaDescriptor> cTerms = new List<ProFormaDescriptor>();
+        List<ProFormaTag> tags = new List<ProFormaTag>();
+
+        foreach (Match match in _modRx.Matches(sequence))
+        {
+            var residues = match.Groups[1];
+            var startIndex = indexLookup[residues.Index];
+            var ptms = match.Groups[2].Captures;
+
+            if (ptms.Count > 1)
+                throw new TopPicParserException("multiple mods are not currently supported");
+
+            if (startIndex == 0 && residues.Length == 1) // check for ambiguous mods that include the start -> just make tags
+            {
+                nTerms = ParsePtms(ptms);
+            }
+            else if (startIndex == lastIndex)
+            {
+                cTerms = ParsePtms(ptms);
+            }
+            else if (residues.Length > 1)
+            {
+                var endIndex = startIndex + residues.Length - 1;
+                tags.Add(new ProFormaTag(startIndex, endIndex, ParsePtms(ptms)));
+            }
+            else
+            {
+                tags.Add(new ProFormaTag(startIndex, ParsePtms(ptms)));
+            }
+        }
+
+        return new Tuple<IList<ProFormaDescriptor>, IList<ProFormaDescriptor>, IList<ProFormaTag>>(nTerms, cTerms, tags);
+    }
+
+    /// <summary>
+    /// Converts every captured "[...]" modification into its descriptors.
+    /// </summary>
+    private List<ProFormaDescriptor> ParsePtms(CaptureCollection ptms)
+    {
+        var proformaList = new List<ProFormaDescriptor>();
+
+        foreach (Capture ptm in ptms)
+            proformaList.AddRange(ParsePtmString(ptm.Value));
+
+        return proformaList;
+    }
+
+    /// <summary>
+    /// Converts one bracketed modification into descriptors: a mass shift, a mod-file entry, or the bare name.
+    /// </summary>
+    /// <param name="ptmString">The modification including its surrounding square brackets.</param>
+    /// <exception cref="TopPicParserException">The modification contains a '*'.</exception>
+    private IList<ProFormaDescriptor> ParsePtmString(string ptmString)
+    {
+        //strip []
+        ptmString = ptmString.Substring(1, ptmString.Length - 2);
+
+        if (_numberRx.IsMatch(ptmString) && double.TryParse(ptmString, NumberStyles.Float, CultureInfo.InvariantCulture, out double val))
+            return new List<ProFormaDescriptor>() { new ProFormaDescriptor(ProFormaKey.Mass, val.ToString("+#.0000;-#.0000;0", CultureInfo.InvariantCulture)) };
+
+        // Find and throw exception if there is a *
+        if (ptmString.Contains('*'))
+            throw new TopPicParserException("multiple mods are not currently supported");
+
+        if (_modLookup is not null && _modLookup.TryGetValue(ptmString, out var descriptors))
+            return descriptors;
+
+        return new List<ProFormaDescriptor>() { new ProFormaDescriptor(ptmString) };
+    }
+
+    /// <summary>
+    /// Cuts the string down to the text between the first and the last terminal '.', dropping any flanking residues.
+    /// A string without terminal delimiters is returned unchanged.
+    /// </summary>
+    /// <exception cref="TopPicParserException">Only one terminal delimiter is present.</exception>
+    private string RemoveTerminalAAs(string sequence)
+    {
+        var matches = _terminalAaRx.Matches(sequence);
+
+        if (matches.Count == 0)
+            return sequence;
+
+        if (matches.Count < 2)
+            throw new TopPicParserException("expected both an N-terminal and a C-terminal '.' delimiter");
+
+        var startIndex = GetDelimiterIndex(matches[0]) + 1;
+        var length = GetDelimiterIndex(matches[matches.Count - 1]) - startIndex;
+        return sequence.Substring(startIndex, length);
+    }
+
+    /// <summary>
+    /// Gets the position of the '.' from whichever alternative of the terminal regex matched.
+    /// </summary>
+    private static int GetDelimiterIndex(Match match) =>
+        match.Groups[1].Success ? match.Groups[1].Index : match.Groups[2].Index;
+}
+
+/// <summary>
+/// An exception for the TopPIC to ProForma parser.
+/// </summary>
+/// <seealso cref="Exception" />
+public class TopPicParserException : Exception
+{
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicParserException"/> class.
+    /// </summary>
+    /// <param name="message">The message that describes the error.</param>
+    public TopPicParserException(string message) : base(message) { }
+}
\ No newline at end of file
diff --git a/tests/TopDownProteomics.Tests/ProForma/ToPicParserTests.cs b/tests/TopDownProteomics.Tests/ProForma/ToPicParserTests.cs
new file mode 100644
index 0000000..787c312
--- /dev/null
+++ b/tests/TopDownProteomics.Tests/ProForma/ToPicParserTests.cs
@@ -0,0 +1,100 @@
+using NUnit.Framework;
+using System.IO;
+using System.Text;
+using TopDownProteomics.ProForma;
+
+namespace TopDownProteomics.Tests.ProForma;
+
+/// <summary>
+/// Tests for the TopPicProformaParser
+/// </summary>
+[TestFixture]
+public class ToPicParserTests
+{
+    private static string GetTestDataFile(string name) => Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", name);
+
+    /// <summary>
+    /// Tests the TopPic Proforma Parser.
+    /// </summary>
+    [Test]
+    [TestCase("M.A(AAA)[Phospho]AAA.C", "A(AAA)[UNIMOD:21|Info:Phospho]AAA")]
+    [TestCase("W.(G)[Oxidation]DGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", "[UNIMOD:35|Info:Oxidation]-GDGCAQKNKPGVYTKV[UNIMOD:21|Info:Phospho]YNYVKWIKNTIAANS")]
+    [TestCase(".GDGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", "GDGCAQKNKPGVYTKV[UNIMOD:21|Info:Phospho]YNYVKWIKNTIAANS")]
+    [TestCase("W.GDGCAQKNKPGVYTKVYNYVKWIKNTIAAN(S)[Phospho].", "GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS-[UNIMOD:21|Info:Phospho]")]
+    [TestCase("W.GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    [TestCase(".(G)[Test1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+59.000000|Info:Test1]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    [TestCase("W.(G)[T@s!1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[T@s!1]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    [TestCase("W.(G)[Test_2]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+59.000000|Info:Test_2]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    [TestCase(".(G)[Ox_plus1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+17.123000|Info:Ox_plus1]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    [TestCase(".(G)[+23.9987]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+23.9987]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    [TestCase(".(G)[23.9987]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+23.9987]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    [TestCase(".(G)[-23.9987]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[-23.9987]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
+    public void CompareToProForma(string topPIC, string proForma)
+    {
+        var topicParser = new TopPicProformaParser(GetTestDataFile("topPicTestMods.txt"));
+        var term = topicParser.ParseTopPicString(topPIC);
+
+        var writer = new ProFormaWriter();
+
+        Assert.AreEqual(proForma, writer.WriteString(term));
+    }
+
+    /// <summary>
+    /// Tests the TopPic Proforma Parser with no mod file.
+    /// </summary>
+    [Test]
+    [TestCase("M.A(AAA)[Phospho]AAA.C", "A(AAA)[Phospho]AAA")]
+    [TestCase("W.(G)[Oxidation]DGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", "[Oxidation]-GDGCAQKNKPGVYTKV[Phospho]YNYVKWIKNTIAANS")]
+    [TestCase("W.(G)[asdf4fdfsd6!]DGCAQKNKPGVYTKYNYVKWIKNTIAANS.", "[asdf4fdfsd6!]-GDGCAQKNKPGVYTKYNYVKWIKNTIAANS")]
+    public void CompareToProFormaNoModFile(string topPIC, string proForma)
+    {
+        var topicParser = new TopPicProformaParser();
+        var term = topicParser.ParseTopPicString(topPIC);
+
+        var writer = new ProFormaWriter();
+
+        Assert.AreEqual(proForma, writer.WriteString(term));
+    }
+
+    /// <summary>
+    /// Testing Exceptions.
+    /// </summary>
+    /// <param name="topPIC">The TopPIC string that is expected to be rejected.</param>
+    [Test]
+    [TestCase("M.A(AAA)[Phospho*4]AAA.C", "multiple mods are not currently supported")]
+    [TestCase("M.A(AAA)[Phospho][Phospho]AAA.C", "multiple mods are not currently supported")]
+    public void ParsingExceptionTesting(string topPIC, string exMessage)
+    {
+        var topicParser = new TopPicProformaParser(GetTestDataFile("topPicTestMods.txt"));
+
+        TestDelegate throwTest = () =>
+        {
+            topicParser.ParseTopPicString(topPIC);
+        };
+
+        TopPicParserException ex = Assert.Throws<TopPicParserException>(throwTest);
+        Assert.AreEqual(exMessage, ex.Message);
+    }
+
+    /// <summary>
+    /// Testing Exceptions.
+    /// </summary>
+    /// <param name="modFileString">The mod file content that is expected to be rejected.</param>
+    [Test]
+    [TestCase(@"Phospho,79.966331,STY,any,21,54", "Failed to parse mod file")]
+    [TestCase(@"Phospho,79.966331,STY,any,2O", "Failed to parse UniMod Id 2O")]
+    [TestCase(@"Phospho,79b.966331,STY,any,-1", "invalid UniMod Id or mass")]
+    [TestCase(@"Phospho,79.966331,STY,any,-5", "invalid UniMod Id or mass")]
+    public void ModFilePArsingExceptionTesting(string modFileString, string exMessage)
+    {
+        MemoryStream stream = new MemoryStream(Encoding.UTF8.GetBytes(modFileString));
+
+        TestDelegate throwTest = () =>
+        {
+            _ = new TopPicProformaParser(stream);
+        };
+
+        TopPicParserException ex = Assert.Throws<TopPicParserException>(throwTest);
+        Assert.AreEqual(exMessage, ex.Message);
+    }
+}
\ No newline at end of file
diff --git a/tests/TopDownProteomics.Tests/TestData/topPicTestMods.txt b/tests/TopDownProteomics.Tests/TestData/topPicTestMods.txt
new file mode 100644
index 0000000..9ad1ed8
--- /dev/null
+++ b/tests/TopDownProteomics.Tests/TestData/topPicTestMods.txt
@@ -0,0 +1,38 @@
+# This file is used to specify modifications
+# # for comments
+# To input a modification, use the following format:
+#
+# Name,Mass,Residues,Position,UnimodID
+#
+# Name: name of the modification (Unimod PSI-MS name)
+#   - The Unimod PSI-MS names are recommended
+#   - E.g. Phospho, Acetyl
+#   - Visit http://www.unimod.org to get PSI-MS names.
+#
+# Mass: monoisotopic mass of modification.
+#   - It is important to specify accurate masses (integer masses are insufficient).
+#   - E.g. 15.994915
+#
+# Residues: amino acids that can be modified
+#   - Use * if this modification is applicable to all the 20 amino acids.
+#
+# Position: positions in the protein where the modification can be attached.
+#   - Only "any" can be used for anywhere
+#
+# UnimodID: Unimod id of the modification
+#   - Please use -1, if not in unimod
+
+# Methionine oxidation
+Oxidation,15.994915,M,any,35
+
+# Phosphorylation
+Phospho,79.966331,STY,any,21
+
+# test1
+Test1,59.0000,STY,any,-1
+
+# test2
+Test_2,59.0000,STY,any,-1
+
+# test3
+Ox_plus1,17.1230,STY,any,-1
\ No newline at end of file