+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Text.RegularExpressions;
+namespace TopDownProteomics.ProForma;
+/// <summary>A parser that converts TopPIC sequence strings into a <see cref="ProFormaTerm"/>.</summary>
+public class TopPicProformaParser
+    // Lookup from a TopPIC modification name to the ProForma descriptors that replace it;
+    // stays null when the parser is created without a mod file.
+    private IDictionary<string, IList<ProFormaDescriptor>>? _modLookup = null;
+
+    #region Regex strings
+    // A parenthesised run of upper-case residues followed by one or more bracketed modifications,
+    // e.g. "(AAA)[Phospho]". Group 1 is the residue run, group 2 captures each bracketed modification.
+    private readonly Regex _modRx = new(@"\(([A-Z]{1,})\)(\[.+?\])+");
+
+    // An optionally signed decimal number. The decimal point is escaped so that only real decimals
+    // match; an unescaped '.' matches any character, which made names such as "Mod123" look like masses.
+    private readonly Regex _numberRx = new(@"(-?\+?[0-9]+\.[0-9]+)");
+
+    // A '.' that has a non-numeric character on at least one side (group 1 or group 2 holds the dot),
+    // so dots between digits (as in a decimal mass) are not matched.
+    private readonly Regex _terminalAaRx = new(@"\P{N}(\.)\P{N}??|\P{N}??(\.)\P{N}");
+
+    // Bracketed modifications and bare parentheses; removing these leaves only the residue letters.
+    private readonly Regex _strippedSequenceRx = new(@"\[.+?\]|[()]");
+    #endregion
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicProformaParser"/> class without a modification lookup.
+    /// </summary>
+    public TopPicProformaParser()
+    {
+    }
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicProformaParser"/> class,
+    /// loading the modification lookup from a file on disk.
+    /// </summary>
+    /// <param name="modFile">The mod.txt file for mapping modifications.</param>
+    public TopPicProformaParser(string modFile)
+    {
+        // The stream is handed to ParseModFile, which reads it to the end.
+        FileStream modStream = new FileInfo(modFile).OpenRead();
+        _modLookup = ParseModFile(modStream);
+    }
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicProformaParser"/> class,
+    /// loading the modification lookup from an already opened stream.
+    /// </summary>
+    /// <param name="modStream">The mod stream.</param>
+    public TopPicProformaParser(Stream modStream)
+    {
+        var parsedLookup = ParseModFile(modStream);
+        _modLookup = parsedLookup;
+    }
+    /// <summary>
+    /// Converts a TopPIC sequence string into a ProForma term.
+    /// </summary>
+    /// <param name="sequence">The TopPIC sequence.</param>
+    /// <returns>The term built from the stripped sequence, its tags and its terminal descriptors.</returns>
+    public ProFormaTerm ParseTopPicString(string sequence)
+    {
+        // Flanking residues (the parts outside the terminal dots) go first, if present.
+        string core = RemoveTerminalAAs(sequence);
+
+        // Item1 = N-terminal descriptors, Item2 = C-terminal descriptors, Item3 = tags.
+        var modifications = FindPTMs(core);
+        string residuesOnly = GetFullyStrippedSequence(core);
+
+        return new ProFormaTerm(residuesOnly, modifications.Item3, modifications.Item1, modifications.Item2);
+    }
+    /// <summary>
+    /// Reads a TopPIC modification file and builds a lookup from modification name to ProForma descriptors.
+    /// </summary>
+    /// <remarks>
+    /// Each data line has the form <c>Name,Mass,Residues,Position,UnimodID</c>. Blank lines and lines
+    /// starting with <c>#</c> are skipped. A positive Unimod id yields a Unimod identifier descriptor;
+    /// an id of -1 with a parsable mass yields a mass descriptor. Numbers are read and written with the
+    /// invariant culture so the result does not depend on the machine's regional settings.
+    /// </remarks>
+    /// <param name="modStream">Stream over the mod file; it is disposed together with the reader.</param>
+    /// <returns>The descriptors for each modification, keyed by modification name.</returns>
+    /// <exception cref="TopPicParserException">
+    /// A data line does not have exactly five fields, the Unimod id is not an integer,
+    /// or the id/mass combination is not usable.
+    /// </exception>
+    private IDictionary<string, IList<ProFormaDescriptor>> ParseModFile(Stream modStream)
+    {
+        IDictionary<string, IList<ProFormaDescriptor>> modLookup = new Dictionary<string, IList<ProFormaDescriptor>>();
+        using StreamReader reader = new StreamReader(modStream);
+
+        string? line;
+        while ((line = reader.ReadLine()) != null)
+        {
+            // Short-circuit so StartsWith is never evaluated on an empty/blank line.
+            if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#", StringComparison.Ordinal))
+            {
+                continue;
+            }
+
+            //# To input a modification, use the following format:
+            //# Name,Mass,Residues,Position,UnimodID
+            var splitLine = line.Split(',');
+            if (splitLine.Length != 5)
+            {
+                throw new TopPicParserException("Failed to parse mod file");
+            }
+
+            var name = splitLine[0];
+            if (!int.TryParse(splitLine[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out var uniModNumber))
+            {
+                throw new TopPicParserException($"Failed to parse UniMod Id {splitLine[4]}".Trim());
+            }
+
+            if (uniModNumber > 0)
+            {
+                modLookup.Add(name, new List<ProFormaDescriptor>()
+                {
+                    new ProFormaDescriptor(ProFormaKey.Identifier, ProFormaEvidenceType.Unimod, $"UNIMOD:{uniModNumber}"),
+                    new ProFormaDescriptor(ProFormaKey.Info, name)
+                });
+            }
+            else if (uniModNumber == -1
+                && double.TryParse(splitLine[1], NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out var mass))
+            {
+                // Not in Unimod: fall back to a signed mass with six decimals.
+                modLookup.Add(name, new List<ProFormaDescriptor>()
+                {
+                    new ProFormaDescriptor(ProFormaKey.Mass, mass.ToString("+#.000000;-#.000000", CultureInfo.InvariantCulture)),
+                    new ProFormaDescriptor(ProFormaKey.Info, name)
+                });
+            }
+            else
+            {
+                throw new TopPicParserException("invalid UniMod Id or mass");
+            }
+        }
+
+        return modLookup;
+    }
+    /// <summary>
+    /// Removes everything matched by the stripped-sequence pattern (bracketed modifications and parentheses).
+    /// </summary>
+    /// <param name="sequence">The sequence to strip.</param>
+    /// <returns>The sequence with every pattern match replaced by an empty string.</returns>
+    private string GetFullyStrippedSequence(string sequence)
+    {
+        string stripped = _strippedSequenceRx.Replace(sequence, "");
+        return stripped;
+    }
+    /// <summary>
+    /// Maps the character position of every upper-case letter outside square brackets
+    /// to its zero-based ordinal among those letters.
+    /// </summary>
+    /// <param name="sequence">The annotated sequence to scan.</param>
+    /// <returns>A dictionary from character position to residue ordinal.</returns>
+    private Dictionary<int, int> GetIndexLookup(string sequence)
+    {
+        var positions = new Dictionary<int, int>();
+        int residueOrdinal = 0;
+        bool insideBrackets = false;
+        int charPosition = 0;
+
+        foreach (char ch in sequence)
+        {
+            switch (ch)
+            {
+                case '[':
+                    insideBrackets = true;
+                    break;
+                case ']':
+                    insideBrackets = false;
+                    break;
+                default:
+                    // Letters inside [...] belong to a modification name, not to the sequence.
+                    if (!insideBrackets && char.IsUpper(ch))
+                    {
+                        positions[charPosition] = residueOrdinal++;
+                    }
+                    break;
+            }
+
+            charPosition++;
+        }
+
+        return positions;
+    }
+    /// <summary>
+    /// Finds every "(residues)[modification]" group in the sequence and sorts it into
+    /// N-terminal descriptors, C-terminal descriptors or positional tags.
+    /// </summary>
+    /// <param name="sequence">The TopPIC sequence with flanking residues already removed.</param>
+    /// <returns>
+    /// A tuple of (N-terminal descriptors, C-terminal descriptors, tags).
+    /// </returns>
+    /// <exception cref="TopPicParserException">A residue group carries more than one bracketed modification.</exception>
+    private Tuple<IList<ProFormaDescriptor>, IList<ProFormaDescriptor>, IList<ProFormaTag>> FindPTMs(string sequence)
+    {
+        var indexLookup = GetIndexLookup(sequence);
+
+        // The last residue index does not change while matching, so compute it once instead of per match.
+        // -1 can never equal a real start index, which keeps an empty lookup harmless.
+        int lastResidueIndex = indexLookup.Count == 0 ? -1 : indexLookup.Values.Max();
+
+        List<ProFormaDescriptor> nTerms = new List<ProFormaDescriptor>();
+        List<ProFormaDescriptor> cTerms = new List<ProFormaDescriptor>();
+        List<ProFormaTag> tags = new List<ProFormaTag>();
+
+        foreach (Match match in _modRx.Matches(sequence))
+        {
+            Group residues = match.Groups[1];
+            int startIndex = indexLookup[residues.Index];
+            CaptureCollection ptms = match.Groups[2].Captures;
+
+            if (ptms.Count > 1)
+            {
+                throw new TopPicParserException("multiple mods are not currently supported");
+            }
+
+            if (startIndex == 0 && residues.Length == 1) // check for ambiguous mods that include the start -> just make tags
+            {
+                nTerms = ParsePtms(ptms);
+            }
+            else if (startIndex == lastResidueIndex)
+            {
+                cTerms = ParsePtms(ptms);
+            }
+            else if (residues.Length > 1)
+            {
+                // A run of residues: the tag spans from the first to the last residue of the run.
+                int endIndex = startIndex + residues.Length - 1;
+                tags.Add(new ProFormaTag(startIndex, endIndex, ParsePtms(ptms)));
+            }
+            else
+            {
+                tags.Add(new ProFormaTag(startIndex, ParsePtms(ptms)));
+            }
+        }
+
+        return new Tuple<IList<ProFormaDescriptor>, IList<ProFormaDescriptor>, IList<ProFormaTag>>(nTerms, cTerms, tags);
+    }
+    /// <summary>
+    /// Converts every captured bracketed modification into descriptors and concatenates the results.
+    /// </summary>
+    /// <param name="ptms">The captures of the bracketed modifications, brackets included.</param>
+    /// <returns>All descriptors, in capture order.</returns>
+    private List<ProFormaDescriptor> ParsePtms(CaptureCollection ptms)
+    {
+        var descriptors = new List<ProFormaDescriptor>();
+
+        for (int i = 0; i < ptms.Count; i++)
+        {
+            var parsed = ParsePtmString(ptms[i].ToString());
+            descriptors.AddRange(parsed);
+        }
+
+        return descriptors;
+    }
+    /// <summary>
+    /// Converts one bracketed TopPIC modification (e.g. "[Phospho]" or "[+79.9663]") into ProForma descriptors.
+    /// </summary>
+    /// <remarks>
+    /// A decimal number inside the brackets becomes a mass descriptor. Otherwise the text is looked up in
+    /// the modification file mapping, if one was loaded, and falls back to a plain descriptor carrying the
+    /// text itself. Numbers are parsed and formatted with the invariant culture so the result does not
+    /// depend on the machine's regional settings.
+    /// </remarks>
+    /// <param name="ptmString">The modification text including its surrounding square brackets.</param>
+    /// <returns>The descriptors for the modification.</returns>
+    /// <exception cref="TopPicParserException">The modification text contains a '*'.</exception>
+    private IList<ProFormaDescriptor> ParsePtmString(string ptmString)
+    {
+        //strip []
+        ptmString = ptmString.Substring(1, ptmString.Length - 2);
+
+        var numberMatch = _numberRx.Match(ptmString);
+        if (numberMatch.Success
+            && double.TryParse(numberMatch.Value, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out double val))
+        {
+            return new List<ProFormaDescriptor>()
+            {
+                new ProFormaDescriptor(ProFormaKey.Mass, val.ToString("+#.0000;-#.0000;0", CultureInfo.InvariantCulture))
+            };
+        }
+
+        // Find and throw exception if there is a *
+        if (ptmString.Contains('*'))
+        {
+            throw new TopPicParserException("multiple mods are not currently supported");
+        }
+
+        // Single lookup instead of ContainsKey followed by the indexer.
+        if (_modLookup != null && _modLookup.TryGetValue(ptmString, out var mapped))
+        {
+            return mapped;
+        }
+
+        return new List<ProFormaDescriptor>() { new ProFormaDescriptor(ptmString) };
+    }
+    /// <summary>
+    /// Removes the flanking residues of a TopPIC string, i.e. everything up to and including the first
+    /// terminal dot and everything from the last terminal dot onwards ("M.AAA.C" becomes "AAA").
+    /// </summary>
+    /// <param name="sequence">The raw TopPIC sequence.</param>
+    /// <returns>The text between the two dots, or the input unchanged when it contains no terminal dot.</returns>
+    /// <exception cref="TopPicParserException">Only one terminal dot was found, so the sequence cannot be delimited.</exception>
+    private string RemoveTerminalAAs(string sequence)
+    {
+        var matches = _terminalAaRx.Matches(sequence);
+        if (matches.Count == 0)
+        {
+            return sequence;
+        }
+
+        // Previously a single match caused an ArgumentOutOfRangeException when indexing the second match.
+        if (matches.Count < 2)
+        {
+            throw new TopPicParserException("Failed to find both terminal residue separators");
+        }
+
+        // Use the outermost dots so a dot inside a modification name is not mistaken for the C-terminal separator.
+        int startIndex = DotIndex(matches[0]) + 1;
+        int endIndex = DotIndex(matches[matches.Count - 1]);
+        return sequence.Substring(startIndex, endIndex - startIndex);
+
+        // The pattern has two alternatives; the dot is captured by whichever group took part in the match.
+        static int DotIndex(Match match) =>
+            match.Groups[1].Success ? match.Groups[1].Index : match.Groups[2].Index;
+    }
+/// <summary>
+/// An exception for the TopPIC to ProForma parser.
+/// </summary>
+public class TopPicParserException : Exception
+{
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TopPicParserException"/> class.
+    /// </summary>
+    /// <param name="message">The message that describes the error.</param>
+    public TopPicParserException(string message)
+        : base(message)
+    {
+    }
+}
\ No newline at end of file
+using NUnit.Framework;
+using System.IO;
+using System.Text;
+using TopDownProteomics.ProForma;
+namespace TopDownProteomics.Tests.ProForma;
+/// <summary>
+/// Tests for the <see cref="TopPicProformaParser"/>.
+/// </summary>
+public class ToPicParserTests
+{
+    /// <summary>Builds the path of a file in the TestData folder of the test directory.</summary>
+    private static string GetTestDataFile(string name)
+    {
+        return Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", name);
+    }
+
+    /// <summary>
+    /// Parses a TopPIC string with a mod file loaded and compares the written ProForma string.
+    /// </summary>
+    /// <param name="topPIC">The TopPIC input.</param>
+    /// <param name="proForma">The expected ProForma output.</param>
+    [Test]
+    [TestCase("M.A(AAA)[Phospho]AAA.C", "A(AAA)[UNIMOD:21|Info:Phospho]AAA")]
+    public void CompareToProForma(string topPIC, string proForma)
+    {
+        var parser = new TopPicProformaParser(GetTestDataFile("topPicTestMods.txt"));
+
+        ProFormaTerm term = parser.ParseTopPicString(topPIC);
+        string written = new ProFormaWriter().WriteString(term);
+
+        Assert.AreEqual(proForma, written);
+    }
+
+    /// <summary>
+    /// Parses a TopPIC string without a mod file and compares the written ProForma string.
+    /// </summary>
+    /// <param name="topPIC">The TopPIC input.</param>
+    /// <param name="proForma">The expected ProForma output.</param>
+    [Test]
+    [TestCase("M.A(AAA)[Phospho]AAA.C", "A(AAA)[Phospho]AAA")]
+    public void CompareToProFormaNoModFile(string topPIC, string proForma)
+    {
+        var parser = new TopPicProformaParser();
+
+        ProFormaTerm term = parser.ParseTopPicString(topPIC);
+        string written = new ProFormaWriter().WriteString(term);
+
+        Assert.AreEqual(proForma, written);
+    }
+
+    /// <summary>
+    /// Checks that unsupported TopPIC strings raise a parser exception with the expected message.
+    /// </summary>
+    /// <param name="topPIC">The TopPIC input.</param>
+    /// <param name="exMessage">The expected exception message.</param>
+    [Test]
+    [TestCase("M.A(AAA)[Phospho*4]AAA.C", "multiple mods are not currently supported")]
+    [TestCase("M.A(AAA)[Phospho][Phospho]AAA.C", "multiple mods are not currently supported")]
+    public void ParsingExceptionTesting(string topPIC, string exMessage)
+    {
+        var parser = new TopPicProformaParser(GetTestDataFile("topPicTestMods.txt"));
+
+        TopPicParserException ex = Assert.Throws<TopPicParserException>(
+            () => parser.ParseTopPicString(topPIC));
+
+        Assert.AreEqual(exMessage, ex.Message);
+    }
+
+    /// <summary>
+    /// Checks that malformed mod file lines raise a parser exception with the expected message.
+    /// </summary>
+    /// <param name="modFileString">The mod file content.</param>
+    /// <param name="exMessage">The expected exception message.</param>
+    [Test]
+    [TestCase(@"Phospho,79.966331,STY,any,21,54", "Failed to parse mod file")]
+    [TestCase(@"Phospho,79.966331,STY,any,2O", "Failed to parse UniMod Id 2O")]
+    [TestCase(@"Phospho,79b.966331,STY,any,-1", "invalid UniMod Id or mass")]
+    [TestCase(@"Phospho,79.966331,STY,any,-5", "invalid UniMod Id or mass")]
+    public void ModFilePArsingExceptionTesting(string modFileString, string exMessage)
+    {
+        byte[] content = Encoding.UTF8.GetBytes(modFileString);
+        MemoryStream stream = new MemoryStream(content);
+
+        // The mod file is parsed inside the constructor, so construction itself must throw.
+        TopPicParserException ex = Assert.Throws<TopPicParserException>(
+            () => { var parser = new TopPicProformaParser(stream); });
+
+        Assert.AreEqual(exMessage, ex.Message);
+    }
+}
\ No newline at end of file
+# This file is used to specify modifications
+# Lines starting with # are comments
+# To input a modification, use the following format:
+# Name,Mass,Residues,Position,UnimodID
+# Name: name of the modification (Unimod PSI-MS name)
+# - The Unimod PSI-MS names are recommended
+# - E.g. Phospho, Acetyl
+# - Visit http://www.unimod.org to get PSI-MS names.
+# Mass: monoisotopic mass of modification.
+# - It is important to specify accurate masses (integer masses are insufficient).
+# - E.g. 15.994915
+# Residues: amino acids that can be modified
+# - Use * if this modification is applicable to all the 20 amino acids.
+# Position: positions in the protein where the modification can be attached.
+# - Only "any" is supported (the modification may occur anywhere).
+# UnimodID: Unimod ID of the modification
+# - Please use -1, if not in unimod
+# Methionine oxidation
+# Phosphorylation
+# test1
+# test2
+# test3
\ No newline at end of file