Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TopPic Proforma Parser #113

Merged
merged 13 commits into from
Feb 14, 2024
216 changes: 216 additions & 0 deletions src/TopDownProteomics/ProForma/TopPicProformaParser.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;

namespace TopDownProteomics.ProForma;

/// <summary>
/// Parses TopPIC proteoform strings into a <see cref="ProFormaTerm"/>.
/// </summary>
public class TopPicProformaParser
{
// Maps a modification name from the mod file to the ProForma descriptors that replace it.
// Remains null when the parser is created without a mod file.
private readonly IDictionary<string, IList<ProFormaDescriptor>>? _modLookup;

#region Regex strings
// Parenthesised upper-case residues followed by one or more bracketed modifications,
// e.g. "(AAA)[Phospho]". Group 1 holds the residues; group 2 captures each "[...]".
private static readonly Regex _modRx = new(@"\(([A-Z]{1,})\)(\[.+?\])+");

// A complete signed decimal mass such as "+23.9987", "23.9987" or "-23.9987".
// The decimal point is escaped and the pattern is anchored so that a name that merely
// contains digits (e.g. "Test_123") is never mistaken for a mass.
private static readonly Regex _numberRx = new(@"^[+-]?[0-9]+\.[0-9]+$");

// A '.' with a non-digit neighbour, i.e. a flanking-residue delimiter rather than a decimal point.
// The dot is captured in group 1 or group 2 depending on which alternative matched.
private static readonly Regex _terminalAaRx = new(@"\P{N}(\.)\P{N}??|\P{N}??(\.)\P{N}");

// Everything that is not a residue: bracketed modifications and the parentheses themselves.
private static readonly Regex _strippedSequenceRx = new(@"\[.+?\]|[()]");
#endregion

/// <summary>
/// Initializes a new instance of the <see cref="TopPicProformaParser"/> class without a mod file,
/// so modification names are not mapped through a lookup.
/// </summary>
public TopPicProformaParser()
{
}

/// <summary>
/// Initializes a new instance of the <see cref="TopPicProformaParser"/> class from a mod file on disk.
/// </summary>
/// <param name="modFile">Path to the mod.txt file used for mapping modifications.</param>
public TopPicProformaParser(string modFile)
{
    var modFileInfo = new FileInfo(modFile);
    FileStream modFileStream = modFileInfo.OpenRead();

    _modLookup = ParseModFile(modFileStream);
}

/// <summary>
/// Initializes a new instance of the <see cref="TopPicProformaParser"/> class from an already opened mod file.
/// </summary>
/// <param name="modStream">The stream holding the mod file contents.</param>
public TopPicProformaParser(Stream modStream)
{
    var parsedLookup = ParseModFile(modStream);
    _modLookup = parsedLookup;
}

Check warning on line 44 in src/TopDownProteomics/ProForma/TopPicProformaParser.cs

View check run for this annotation

Codecov / codecov/patch

src/TopDownProteomics/ProForma/TopPicProformaParser.cs#L44

Added line #L44 was not covered by tests

/// <summary>
/// Converts a TopPIC proteoform string into a <see cref="ProFormaTerm"/>.
/// </summary>
/// <param name="sequence">The TopPIC string, e.g. "M.A(AAA)[Phospho]AAA.C".</param>
/// <returns>
/// A term built from the stripped residue sequence, the modification tags,
/// and the N- and C-terminal descriptors found in <paramref name="sequence"/>.
/// </returns>
public ProFormaTerm ParseTopPicString(string sequence)
{
    // Flanking residues and their '.' delimiters go first so that every
    // later position is relative to the proteoform itself.
    string trimmed = RemoveTerminalAAs(sequence);

    var ptms = FindPTMs(trimmed);
    string strippedSequence = GetFullyStrippedSequence(trimmed);

    // Tuple layout from FindPTMs: Item1 = N-terminal, Item2 = C-terminal, Item3 = tags.
    return new ProFormaTerm(strippedSequence, ptms.Item3, ptms.Item1, ptms.Item2);
}

/// <summary>
/// Reads a TopPIC mod file and builds the lookup from modification name to ProForma descriptors.
/// </summary>
/// <remarks>
/// Blank lines and lines starting with '#' are skipped. Every other line must have the five
/// comma-separated fields <c>Name,Mass,Residues,Position,UnimodID</c>. A positive Unimod id yields a
/// Unimod identifier descriptor; an id of -1 with a parsable mass yields a mass descriptor.
/// Numbers are read and written with the invariant culture because the file is machine-readable.
/// The stream is closed when reading finishes, since the <see cref="StreamReader"/> owns it.
/// </remarks>
/// <param name="modStream">The stream holding the mod file contents.</param>
/// <returns>The lookup keyed by modification name.</returns>
/// <exception cref="TopPicParserException">
/// A line does not have five fields, the Unimod id is not an integer, or the id/mass combination is invalid.
/// </exception>
private IDictionary<string, IList<ProFormaDescriptor>> ParseModFile(Stream modStream)
{
    IDictionary<string, IList<ProFormaDescriptor>> modLookup = new Dictionary<string, IList<ProFormaDescriptor>>();

    using StreamReader reader = new StreamReader(modStream);

    string? line;
    while ((line = reader.ReadLine()) is not null)
    {
        // Short-circuit so StartsWith is never evaluated on a blank line.
        if (string.IsNullOrWhiteSpace(line) || line.StartsWith("#", StringComparison.Ordinal))
            continue;

        //# To input a modification, use the following format:
        //# Name,Mass,Residues,Position,UnimodID
        var splitLine = line.Split(',');

        if (splitLine.Length != 5)
            throw new TopPicParserException("Failed to parse mod file");

        var name = splitLine[0];

        if (!int.TryParse(splitLine[4], NumberStyles.Integer, CultureInfo.InvariantCulture, out var uniModNumber))
            throw new TopPicParserException($"Failed to parse UniMod Id {splitLine[4]}".Trim());

        if (uniModNumber > 0)
        {
            modLookup.Add(name, new List<ProFormaDescriptor>()
            {
                new ProFormaDescriptor(ProFormaKey.Identifier, ProFormaEvidenceType.Unimod, $"UNIMOD:{uniModNumber}"),
                new ProFormaDescriptor(ProFormaKey.Info, name)
            });
        }
        else if (uniModNumber == -1
            && double.TryParse(splitLine[1], NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out var mass))
        {
            // Always emit an explicit sign and a '.' decimal separator, whatever the current culture is.
            var massText = mass.ToString("+#.000000;-#.000000", CultureInfo.InvariantCulture);

            modLookup.Add(name, new List<ProFormaDescriptor>()
            {
                new ProFormaDescriptor(ProFormaKey.Mass, massText),
                new ProFormaDescriptor(ProFormaKey.Info, name)
            });
        }
        else
        {
            throw new TopPicParserException($"invalid UniMod Id or mass");
        }
    }

    return modLookup;
}

mikehollas123 marked this conversation as resolved.
Show resolved Hide resolved
/// <summary>
/// Removes every bracketed modification and every parenthesis, leaving only the residue letters.
/// </summary>
/// <param name="sequence">The TopPIC sequence without flanking residues.</param>
/// <returns>The sequence with all matches of the strip pattern removed.</returns>
private string GetFullyStrippedSequence(string sequence)
{
    return _strippedSequenceRx.Replace(sequence, string.Empty);
}

/// <summary>
/// Maps the character position of each residue in the annotated string to its zero-based residue index.
/// </summary>
/// <remarks>
/// A residue is any upper-case character that is not between '[' and ']'. Residue indices are
/// handed out consecutively in reading order.
/// </remarks>
/// <param name="sequence">The TopPIC sequence without flanking residues.</param>
/// <returns>A dictionary keyed by character position whose value is the residue index.</returns>
private Dictionary<int, int> GetIndexLookup(string sequence)
{
    var residueIndexByPosition = new Dictionary<int, int>();

    int residueCount = 0;
    bool insideModification = false;

    for (int position = 0; position < sequence.Length; position++)
    {
        switch (sequence[position])
        {
            case '[':
                insideModification = true;
                break;

            case ']':
                insideModification = false;
                break;

            // Upper-case letters inside a modification name are not residues.
            case var candidate when !insideModification && char.IsUpper(candidate):
                residueIndexByPosition[position] = residueCount;
                residueCount++;
                break;
        }
    }

    return residueIndexByPosition;
}

/// <summary>
/// Finds every "(residues)[modification]" group in the sequence and sorts it into N-terminal
/// descriptors, C-terminal descriptors, or positional tags.
/// </summary>
/// <remarks>
/// A single-residue group on the first residue becomes the N-terminal descriptors, and a group that
/// starts on the last residue becomes the C-terminal descriptors. A multi-residue group becomes a
/// ranged tag; anything else becomes a single-position tag.
/// </remarks>
/// <param name="sequence">The TopPIC sequence without flanking residues.</param>
/// <returns>A tuple of (N-terminal descriptors, C-terminal descriptors, tags).</returns>
/// <exception cref="TopPicParserException">A residue group carries more than one bracketed modification.</exception>
private Tuple<IList<ProFormaDescriptor>, IList<ProFormaDescriptor>, IList<ProFormaTag>> FindPTMs(string sequence)
{
    var indexLookup = GetIndexLookup(sequence);

    // Residue indices are assigned consecutively from zero, so the last one is Count - 1.
    // Computed once here rather than scanning the lookup again for every match.
    int lastResidueIndex = indexLookup.Count - 1;

    List<ProFormaDescriptor> nTerms = new List<ProFormaDescriptor>();
    List<ProFormaDescriptor> cTerms = new List<ProFormaDescriptor>();
    List<ProFormaTag> tags = new List<ProFormaTag>();

    foreach (Match match in _modRx.Matches(sequence))
    {
        var residues = match.Groups[1];
        var startIndex = indexLookup[residues.Index];
        var ptms = match.Groups[2].Captures;

        if (ptms.Count > 1)
            throw new TopPicParserException("multiple mods are not currently supported");

        if (startIndex == 0 && residues.Length == 1) // check for ambiguous mods that include the start -> just make tags
        {
            nTerms = ParsePtms(ptms);
        }
        else if (startIndex == lastResidueIndex)
        {
            cTerms = ParsePtms(ptms);
        }
        else if (residues.Length > 1)
        {
            var endIndex = startIndex + residues.Length - 1;
            tags.Add(new ProFormaTag(startIndex, endIndex, ParsePtms(ptms)));
        }
        else
        {
            tags.Add(new ProFormaTag(startIndex, ParsePtms(ptms)));
        }
    }

    return new Tuple<IList<ProFormaDescriptor>, IList<ProFormaDescriptor>, IList<ProFormaTag>>(nTerms, cTerms, tags);
}

/// <summary>
/// Converts every captured "[...]" modification into descriptors and returns them as one list.
/// </summary>
/// <param name="ptms">The bracketed modification captures of a single residue group.</param>
/// <returns>A new list holding the descriptors of all captures, in capture order.</returns>
private List<ProFormaDescriptor> ParsePtms(CaptureCollection ptms)
{
    var descriptors = new List<ProFormaDescriptor>();

    for (int i = 0; i < ptms.Count; i++)
    {
        string bracketedPtm = ptms[i].ToString();
        descriptors.AddRange(ParsePtmString(bracketedPtm));
    }

    return descriptors;
}

/// <summary>
/// Converts one bracketed TopPIC modification (e.g. "[Phospho]" or "[+23.9987]") into ProForma descriptors.
/// </summary>
/// <remarks>
/// The content is handled in this order: a '*' multiplier is rejected; a value that is numeric in its
/// entirety becomes a signed mass descriptor; a name found in the mod lookup returns the descriptors
/// stored for it; any other text becomes a plain descriptor. Masses are read and written with the
/// invariant culture so the result does not depend on the machine's regional settings.
/// </remarks>
/// <param name="ptmString">The modification including its enclosing square brackets.</param>
/// <returns>The descriptors for the modification.</returns>
/// <exception cref="TopPicParserException">The modification contains a '*' multiplier.</exception>
private IList<ProFormaDescriptor> ParsePtmString(string ptmString)
{
    //strip []
    ptmString = ptmString.Substring(1, ptmString.Length - 2);

    // Reject multipliers before looking for a number, otherwise "+15.9949*2"
    // would be accepted as a single mass shift and the "*2" silently dropped.
    if (ptmString.Contains('*'))
        throw new TopPicParserException("multiple mods are not currently supported");

    var numberMatch = _numberRx.Match(ptmString);

    // Only a match covering the whole content is a mass; a name that merely
    // contains digits must fall through to the lookup below.
    if (numberMatch.Success
        && numberMatch.Length == ptmString.Length
        && double.TryParse(numberMatch.Value, NumberStyles.Float, CultureInfo.InvariantCulture, out double val))
    {
        var massText = val.ToString("+#.0000;-#.0000;0", CultureInfo.InvariantCulture);
        return new List<ProFormaDescriptor>() { new ProFormaDescriptor(ProFormaKey.Mass, massText) };
    }

    if (_modLookup is not null && _modLookup.TryGetValue(ptmString, out var mapped))
        return mapped;

    return new List<ProFormaDescriptor>() { new ProFormaDescriptor(ptmString) };
}

/// <summary>
/// Removes the flanking residues and their '.' delimiters from a TopPIC string,
/// e.g. "M.A(AAA)[Phospho]AAA.C" becomes "A(AAA)[Phospho]AAA".
/// </summary>
/// <remarks>
/// With two or more delimiters, the text between the first and the last one is returned. With a single
/// delimiter, it is treated as N-terminal when it sits within the first two characters and is not the
/// final character, and as C-terminal when it sits within the last two characters. A string without
/// delimiters is returned unchanged.
/// </remarks>
/// <param name="sequence">The raw TopPIC string.</param>
/// <returns>The sequence without flanking residues.</returns>
/// <exception cref="TopPicParserException">
/// The only delimiter found is neither at the start nor at the end of the string.
/// </exception>
private string RemoveTerminalAAs(string sequence)
{
    var matches = _terminalAaRx.Matches(sequence);

    if (matches.Count == 0)
        return sequence;

    int firstDot = DelimiterIndex(matches[0]);

    if (matches.Count > 1)
    {
        // The C-terminal delimiter is the last one found.
        int lastDot = DelimiterIndex(matches[matches.Count - 1]);
        return sequence.Substring(firstDot + 1, lastDot - firstDot - 1);
    }

    // Only one delimiter: decide which end it belongs to instead of indexing a second match that does not exist.
    bool isFinalCharacter = firstDot == sequence.Length - 1;

    if (firstDot <= 1 && !isFinalCharacter)
        return sequence.Substring(firstDot + 1); // "M.AAAA" or ".AAAA"

    if (firstDot >= sequence.Length - 2)
        return sequence.Substring(0, firstDot); // "AAAA.C" or "AAAA."

    throw new TopPicParserException("Failed to parse terminal amino acids");

    // The pattern has two alternatives, each with its own capture group for the dot;
    // read the position from whichever group actually took part in the match.
    static int DelimiterIndex(Match match) =>
        match.Groups[1].Success ? match.Groups[1].Index : match.Groups[2].Index;
}
}

/// <summary>
/// An exception for the TopPIC to ProForma parser.
/// </summary>
/// <seealso cref="System.Exception" />
public class TopPicParserException : Exception
{
    /// <summary>
    /// Initializes a new instance of the <see cref="TopPicParserException"/> class.
    /// </summary>
    public TopPicParserException() { }

    /// <summary>
    /// Initializes a new instance of the <see cref="TopPicParserException"/> class.
    /// </summary>
    /// <param name="message">The message that describes the error.</param>
    public TopPicParserException(string message) : base(message) { }

    /// <summary>
    /// Initializes a new instance of the <see cref="TopPicParserException"/> class
    /// that wraps the exception which caused it.
    /// </summary>
    /// <param name="message">The message that describes the error.</param>
    /// <param name="innerException">The exception that is the cause of this exception.</param>
    public TopPicParserException(string message, Exception innerException) : base(message, innerException) { }
}
100 changes: 100 additions & 0 deletions tests/TopDownProteomics.Tests/ProForma/ToPicParserTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
using NUnit.Framework;
using System.IO;
using System.Text;
using TopDownProteomics.ProForma;

namespace TopDownProteomics.Tests.ProForma;

/// <summary>
/// Tests for the TopPicProformaParser <see cref="TopPicProformaParser"/>
/// </summary>
[TestFixture]
public class ToPicParserTests
{
/// <summary>
/// Builds the path of a file in the "TestData" folder under the current test directory.
/// </summary>
/// <param name="name">The file name inside the "TestData" folder.</param>
/// <returns>The combined path.</returns>
private static string GetTestDataFile(string name)
{
    string testDirectory = TestContext.CurrentContext.TestDirectory;
    return Path.Combine(testDirectory, "TestData", name);
}

/// <summary>
/// Parses each TopPIC string with the test mod file loaded and checks the ProForma text written for it.
/// </summary>
/// <param name="topPIC">The TopPIC input string.</param>
/// <param name="proForma">The expected ProForma string.</param>
[Test]
[TestCase("M.A(AAA)[Phospho]AAA.C", "A(AAA)[UNIMOD:21|Info:Phospho]AAA")]
[TestCase("W.(G)[Oxidation]DGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", "[UNIMOD:35|Info:Oxidation]-GDGCAQKNKPGVYTKV[UNIMOD:21|Info:Phospho]YNYVKWIKNTIAANS")]
[TestCase(".GDGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", "GDGCAQKNKPGVYTKV[UNIMOD:21|Info:Phospho]YNYVKWIKNTIAANS")]
[TestCase("W.GDGCAQKNKPGVYTKVYNYVKWIKNTIAAN(S)[Phospho].", "GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS-[UNIMOD:21|Info:Phospho]")]
[TestCase("W.GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
[TestCase(".(G)[Test1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+59.000000|Info:Test1]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
[TestCase("W.(G)[T@s!1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[T@s!1]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
[TestCase("W.(G)[Test_2]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+59.000000|Info:Test_2]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
[TestCase(".(G)[Ox_plus1]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+17.123000|Info:Ox_plus1]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
[TestCase(".(G)[+23.9987]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+23.9987]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
[TestCase(".(G)[23.9987]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[+23.9987]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
[TestCase(".(G)[-23.9987]DGCAQKNKPGVYTKVYNYVKWIKNTIAANS.", "[-23.9987]-GDGCAQKNKPGVYTKVYNYVKWIKNTIAANS")]
public void CompareToProForma(string topPIC, string proForma)
{
    // Arrange
    var modFilePath = GetTestDataFile("topPicTestMods.txt");
    var parser = new TopPicProformaParser(modFilePath);

    // Act
    var parsedTerm = parser.ParseTopPicString(topPIC);
    var written = new ProFormaWriter().WriteString(parsedTerm);

    // Assert
    Assert.AreEqual(proForma, written);
}

mikehollas123 marked this conversation as resolved.
Show resolved Hide resolved
/// <summary>
/// Parses each TopPIC string without a mod file and checks that modification names are written unchanged.
/// </summary>
/// <param name="topPIC">The TopPIC input string.</param>
/// <param name="proForma">The expected ProForma string.</param>
[Test]
[TestCase("M.A(AAA)[Phospho]AAA.C", "A(AAA)[Phospho]AAA")]
[TestCase("W.(G)[Oxidation]DGCAQKNKPGVYTK(V)[Phospho]YNYVKWIKNTIAANS.", "[Oxidation]-GDGCAQKNKPGVYTKV[Phospho]YNYVKWIKNTIAANS")]
[TestCase("W.(G)[asdf4fdfsd6!]DGCAQKNKPGVYTKYNYVKWIKNTIAANS.", "[asdf4fdfsd6!]-GDGCAQKNKPGVYTKYNYVKWIKNTIAANS")]
public void CompareToProFormaNoModFile(string topPIC, string proForma)
{
    // Arrange
    var parser = new TopPicProformaParser();

    // Act
    var parsedTerm = parser.ParseTopPicString(topPIC);
    var written = new ProFormaWriter().WriteString(parsedTerm);

    // Assert
    Assert.AreEqual(proForma, written);
}

/// <summary>
/// Checks that unsupported TopPIC strings are rejected with the expected parser exception message.
/// </summary>
/// <param name="topPIC">The TopPIC input string that must fail to parse.</param>
/// <param name="exMessage">The expected exception message.</param>
[Test]
[TestCase("M.A(AAA)[Phospho*4]AAA.C", "multiple mods are not currently supported")]
[TestCase("M.A(AAA)[Phospho][Phospho]AAA.C", "multiple mods are not currently supported")]
public void ParsingExceptionTesting(string topPIC, string exMessage)
{
    var parser = new TopPicProformaParser(GetTestDataFile("topPicTestMods.txt"));

    void ParseInput() => parser.ParseTopPicString(topPIC);

    TopPicParserException thrown = Assert.Throws<TopPicParserException>(ParseInput);
    Assert.AreEqual(exMessage, thrown.Message);
}

/// <summary>
/// Checks that malformed mod file lines are rejected with the expected parser exception message.
/// </summary>
/// <param name="modFileString">The mod file content passed to the parser as a stream.</param>
/// <param name="exMessage">The expected exception message.</param>
[Test]
[TestCase(@"Phospho,79.966331,STY,any,21,54", "Failed to parse mod file")]
[TestCase(@"Phospho,79.966331,STY,any,2O", "Failed to parse UniMod Id 2O")]
[TestCase(@"Phospho,79b.966331,STY,any,-1", "invalid UniMod Id or mass")]
[TestCase(@"Phospho,79.966331,STY,any,-5", "invalid UniMod Id or mass")]
public void ModFilePArsingExceptionTesting(string modFileString, string exMessage)
{
    byte[] modFileBytes = Encoding.UTF8.GetBytes(modFileString);
    MemoryStream modFileStream = new MemoryStream(modFileBytes);

    void CreateParser()
    {
        var parser = new TopPicProformaParser(modFileStream);
    }

    TopPicParserException thrown = Assert.Throws<TopPicParserException>(CreateParser);
    Assert.AreEqual(exMessage, thrown.Message);
}
}
38 changes: 38 additions & 0 deletions tests/TopDownProteomics.Tests/TestData/topPicTestMods.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# This file is used to specify modifications
# Lines starting with # are comments
# To input a modification, use the following format:
#
# Name,Mass,Residues,Position,UnimodID
#
# Name: name of the modification (Unimod PSI-MS name)
# - The Unimod PSI-MS names are recommended
# - E.g. Phospho, Acetyl
# - Visit http://www.unimod.org to get PSI-MS names.
#
# Mass: monoisotopic mass of modification.
# - It is important to specify accurate masses (integer masses are insufficient).
# - E.g. 15.994915
#
# Residues: amino acids that can be modified
# - Use * if this modification is applicable to all the 20 amino acids.
#
# Position: positions in the protein where the modification can be attached.
# - Only "any" (anywhere in the protein) can be used
#
# UnimodID: Unimod ID of the modification
# - Please use -1, if not in unimod

# Methionine oxidation
Oxidation,15.994915,M,any,35

# Phosphorylation
Phospho,79.966331,STY,any,21

# test1
Test1,59.0000,STY,any,-1

# test2
Test_2,59.0000,STY,any,-1

# test3
Ox_plus1,17.1230,STY,any,-1
Loading