Skip to content

Commit

Permalink
Get Modifications from Full Sequence (#796)
Browse files Browse the repository at this point in the history
* started mod methods

* made the test pass

* Removed the GetModsAfterDeserialization method in favor of the static IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence method

* Added an exception tag to the summary comment and extended MzLibException to accept an inner exception.

* Made inner exception nullable in MzLibException

---------

Co-authored-by: trishorts <mshort@chem.wisc.edu>
  • Loading branch information
nbollis and trishorts authored Oct 30, 2024
1 parent cb08d67 commit b055693
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 67 deletions.
5 changes: 3 additions & 2 deletions mzLib/MzLibUtil/MzLibException.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
using System;
#nullable enable
using System;

namespace MzLibUtil
{
[Serializable]
public class MzLibException(string message, Exception innerException = null)
public class MzLibException(string message, Exception? innerException = null)
: Exception(message, innerException);
}
81 changes: 81 additions & 0 deletions mzLib/Omics/IBioPolymerWithSetMods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,5 +82,86 @@ public static string GetBaseSequenceFromFullSequence(string fullSequence)
}
return sb.ToString();
}

/// <summary>
/// Parses the bracketed modification annotations of a full sequence and returns each modification
/// keyed by its one-based position.
/// Positions are counted from 1 and advance by one for every character found outside of brackets, so a
/// modification written before the first character is stored at 1 and one written after the i-th
/// character is stored at i + 1. A modification whose LocationRestriction contains "C-terminal." and whose
/// closing bracket is the last character of the full sequence is stored at base sequence length + 2.
/// </summary>
/// <param name="fullSequence">Full sequence, with each modification written as [type:id]</param>
/// <param name="allModsKnown">All known modifications, keyed by the id text that follows the first ':' inside the brackets</param>
/// <returns>Dictionary of one-based position to the modification found at that position</returns>
/// <exception cref="MzLibUtil.MzLibException">When a bracketed modification has no ':' separator, when a mod is not found
/// in the allModsKnown dictionary, or when two modifications resolve to the same position</exception>
public static Dictionary<int, Modification> GetModificationDictionaryFromFullSequence(string fullSequence,
    Dictionary<string, Modification> allModsKnown)
{
    var allModsOneIsNterminus = new Dictionary<int, Modification>();
    var baseSequence = GetBaseSequenceFromFullSequence(fullSequence);
    int currentModStart = 0;
    int currentModificationLocation = 1;
    bool currentlyReadingMod = false;
    int bracketCount = 0;

    for (int r = 0; r < fullSequence.Length; r++)
    {
        char c = fullSequence[r];
        if (c == '[')
        {
            currentlyReadingMod = true;
            if (bracketCount == 0)
            {
                // only the outermost '[' marks the start of the modification text
                currentModStart = r + 1;
            }
            bracketCount++;
        }
        else if (c == ']')
        {
            bracketCount--;
            if (bracketCount != 0)
            {
                // still inside nested brackets; the modification text has not ended yet
                continue;
            }

            // text between the outermost brackets, e.g. "Variable:Oxidation on M"
            string modString = fullSequence.Substring(currentModStart, r - currentModStart);

            // the section before the first ':' (e.g. "Fixed", "Variable", "Uniprot") is not part of the lookup key
            int splitIndex = modString.IndexOf(':');
            if (splitIndex < 0)
            {
                throw new MzLibUtil.MzLibException(
                    "Error while trying to parse string into peptide: modification \"" + modString +
                    "\" has no ':' separating its type from its id");
            }
            string modId = modString.Substring(splitIndex + 1);

            if (!allModsKnown.TryGetValue(modId, out var mod))
            {
                throw new MzLibUtil.MzLibException(
                    "Could not find modification while reading string: " + fullSequence);
            }
            if (mod.LocationRestriction.Contains("C-terminal.") && r == fullSequence.Length - 1)
            {
                currentModificationLocation = baseSequence.Length + 2;
            }

            // report a position collision as a format error instead of leaking Dictionary.Add's ArgumentException
            if (!allModsOneIsNterminus.TryAdd(currentModificationLocation, mod))
            {
                throw new MzLibUtil.MzLibException(
                    "Error while trying to parse string into peptide: more than one modification at position " +
                    currentModificationLocation + " in " + fullSequence);
            }
            currentlyReadingMod = false;
        }
        else if (!currentlyReadingMod)
        {
            // a character outside of any brackets advances the position
            currentModificationLocation++;
        }
        //else: character belongs to the modification text, do nothing
    }

    return allModsOneIsNterminus;
}

/// <summary>
/// Returns the modifications annotated in a full sequence, without their positions
/// </summary>
/// <param name="fullSequence">Full sequence</param>
/// <param name="allModsKnown">All known modifications</param>
/// <returns>The values of the dictionary built by <see cref="GetModificationDictionaryFromFullSequence"/>,
/// copied into a new list in that dictionary's enumeration order</returns>
public static List<Modification> GetModificationsFromFullSequence(string fullSequence,
    Dictionary<string, Modification> allModsKnown)
{
    Dictionary<int, Modification> modsByPosition =
        GetModificationDictionaryFromFullSequence(fullSequence, allModsKnown);
    return new List<Modification>(modsByPosition.Values);
}
}
}
4 changes: 2 additions & 2 deletions mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1103,15 +1103,15 @@ private void ParseSequence(string sequence)
{
modification = new OldSchoolChemicalFormulaModification(ChemicalFormula.ParseFormula(modString));
}
catch (MzLibException)
catch (MzLibException e)
{
if (double.TryParse(modString, out double mass))
{
modification = new ModWithOnlyMass(mass);
}
else
{
throw new MzLibException("Unable to correctly parse the following modification: " + modString);
throw new MzLibException("Unable to correctly parse the following modification: " + modString, e);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ public PeptideWithSetModifications(string sequence, Dictionary<string, Modificat

FullSequence = sequence;
_baseSequence = IBioPolymerWithSetMods.GetBaseSequenceFromFullSequence(sequence);
GetModsAfterDeserialization(allKnownMods);
_allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(sequence, allKnownMods);
NumFixedMods = numFixedMods;
_digestionParams = digestionParams as DigestionParams;
PairedTargetDecoySequence = pairedTargetDecoySequence;
Expand Down Expand Up @@ -910,7 +910,7 @@ public override int GetHashCode()
/// </summary>
public void SetNonSerializedPeptideInfo(Dictionary<string, Modification> idToMod, Dictionary<string, Protein> accessionToProtein, DigestionParams dp)
{
GetModsAfterDeserialization(idToMod);
_allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod);
GetProteinAfterDeserialization(accessionToProtein);
_digestionParams = dp;
}
Expand All @@ -919,66 +919,6 @@ public void SetNonSerializedPeptideInfo(Dictionary<string, Modification> idToMod
Dictionary<string, Protein> accessionToProtein, IDigestionParams dp) =>
SetNonSerializedPeptideInfo(idToMod, accessionToProtein, (DigestionParams)dp);

/// <summary>
/// Rebuilds the position-to-modification dictionary of this peptide from its FullSequence.
/// Delegates to IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence so that there is a single
/// full-sequence parser instead of two copies that can drift apart.
/// </summary>
/// <param name="idToMod">Known modifications, keyed by the id text that follows the ':' inside the brackets</param>
/// <exception cref="MzLibUtil.MzLibException">Propagated from the shared parser when the full sequence cannot be
/// parsed or a modification is not found in idToMod</exception>
private void GetModsAfterDeserialization(Dictionary<string, Modification> idToMod)
{
    // The shared parser derives the base sequence from FullSequence itself rather than reading BaseSequence.
    // The field is only assigned once parsing has succeeded, so a failed parse no longer leaves a
    // partially filled dictionary behind.
    _allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod);
}

private void GetProteinAfterDeserialization(Dictionary<string, Protein> idToProtein)
{
Protein protein = null;
Expand Down
2 changes: 1 addition & 1 deletion mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ public static List<SpectrumMatchFromTsv> ReadTsv(string filePath, out List<strin
}
catch (Exception e)
{
throw new MzLibException("Could not read file: " + e.Message);
throw new MzLibException("Could not read file: " + e.Message, e);
}

int lineCount = 0;
Expand Down
92 changes: 92 additions & 0 deletions mzLib/Test/TestPeptideWithSetMods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using System.Collections.Generic;
using System.IO;
using System.Linq;
using MzLibUtil;
using Omics;
using Omics.Digestion;
using Omics.Fragmentation;
Expand Down Expand Up @@ -1181,5 +1182,96 @@ public static void TestPeptideWithSetModsNoParentProtein()
Assert.AreEqual('-', last.NextAminoAcid);
Assert.AreEqual('-', last.NextResidue);
}

/// <summary>
/// Digests every protein of the cRAP GPTMD database and, for the most heavily modified peptide of each
/// base sequence, checks that the modifications parsed back out of its full sequence by the two
/// IBioPolymerWithSetMods methods agree with the peptide's own AllModsOneIsNterminus and with the
/// modifications the parent protein lists for that residue range.
/// </summary>
[Test]
public static void TestIBioPolymerWithSetModsModificationFromFullSequence()
{
    var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml"));
    Dictionary<string, int> formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized);
    List<Modification> uniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"),
        formalChargesDictionary).ToList();
    // the dictionary returned through the out parameter is not needed by this test
    List<Protein> proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"),
        true, DecoyType.None, uniProtPtms, false, new string[] { "exclude_me" }, out Dictionary<string, Modification> _);
    var allKnownModDict = uniProtPtms.ToDictionary(p => p.IdWithMotif, p => p);
    var digestionParameters = new DigestionParams(maxModsForPeptides: 3);

    foreach (Protein p in proteins)
    {
        List<PeptideWithSetModifications> digestedPeptides =
            p.Digest(digestionParameters, [], [], null, null).ToList();

        // take the most modified peptide by base sequence and ensure all methods function properly
        foreach (var targetPeptide in digestedPeptides
                     .Where(pep => pep.FullSequence.Contains('['))
                     .GroupBy(pep => pep.BaseSequence)
                     .Select(pepGroup => pepGroup.MaxBy(pep => pep.AllModsOneIsNterminus.Count)))
        {
            var startResidue = targetPeptide.OneBasedStartResidue;
            var endResidue = targetPeptide.OneBasedEndResidue;

            // Count the modifications expected from the xml database entries of the parent protein that fall
            // inside the peptide; the count is capped at digestionParameters.MaxMods below
            int expectedModCount = 0;
            foreach (var modDictEntry in p.OneBasedPossibleLocalizedModifications
                         .Where(mod => mod.Key >= startResidue && mod.Key <= endResidue))
            {
                if (modDictEntry.Value.Count > 1)
                {
                    var locRestrictions = modDictEntry.Value.Select(mod => mod.LocationRestriction).ToList();

                    if (locRestrictions.AllSame())
                    {
                        if (locRestrictions.First() == "Anywhere.")
                        {
                            expectedModCount++;
                        }
                        else if (locRestrictions.First() == "N-terminal." && modDictEntry.Key == startResidue)
                        {
                            expectedModCount++;
                        }
                    }
                    else if (locRestrictions.Contains("Anywhere.") && locRestrictions.Contains("N-terminal."))
                    {
                        // one for the "Anywhere." modification, plus one more for the "N-terminal."
                        // modification when this residue is the first residue of the peptide
                        expectedModCount++;
                        if (modDictEntry.Key == startResidue)
                        {
                            expectedModCount++;
                        }
                    }
                }
                else
                {
                    switch (modDictEntry.Value.First().LocationRestriction)
                    {
                        case "Anywhere.":
                        case "N-terminal." when modDictEntry.Key == startResidue:
                            expectedModCount++;
                            break;
                    }
                }
            }

            expectedModCount = Math.Min(expectedModCount, digestionParameters.MaxMods);

            var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod =>
                mod.Key >= startResidue &&
                mod.Key <= endResidue).SelectMany(mod => mod.Value).ToList();

            // Parse modifications from PWSM and two IBioPolymerWithSetMods methods
            var pwsmModDict = targetPeptide.AllModsOneIsNterminus;
            var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict);
            var bpwsmModList = IBioPolymerWithSetMods.GetModificationsFromFullSequence(targetPeptide.FullSequence, allKnownModDict);

            // Ensure all methods are in agreement by modification count
            // (expected value first, so that a failure message labels the two numbers correctly)
            Assert.AreEqual(expectedModCount, pwsmModDict.Count);
            Assert.AreEqual(expectedModCount, bpwsmModDict.Count);
            Assert.AreEqual(expectedModCount, bpwsmModList.Count);

            // Ensure all methods are in agreement by modification identity
            foreach (var pwsmModification in pwsmModDict.Values)
            {
                Assert.Contains(pwsmModification, expectedModifications);
            }
            foreach (var bpwsmDictModification in bpwsmModDict.Values)
            {
                Assert.Contains(bpwsmDictModification, expectedModifications);
            }
            foreach (var bpwsmListModification in bpwsmModList)
            {
                Assert.Contains(bpwsmListModification, expectedModifications);
            }
        }
    }
}
}
}
}

0 comments on commit b055693

Please sign in to comment.