From 8620b8facfb1ec0406f4100820e744ced6abf648 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 18:32:18 -0600 Subject: [PATCH 01/21] Added basic object pools --- mzLib/MzLibUtil/MzLibUtil.csproj | 1 + mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings | 2 + mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs | 58 +++++++++ mzLib/MzLibUtil/ObjectPools/HashSetPool.cs | 60 +++++++++ mzLib/MzLibUtil/ObjectPools/ListPool.cs | 56 ++++++++ mzLib/Test/ObjectPoolTests.cs | 120 ++++++++++++++++++ mzLib/mzLib.nuspec | 2 + 7 files changed, 299 insertions(+) create mode 100644 mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings create mode 100644 mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs create mode 100644 mzLib/MzLibUtil/ObjectPools/HashSetPool.cs create mode 100644 mzLib/MzLibUtil/ObjectPools/ListPool.cs create mode 100644 mzLib/Test/ObjectPoolTests.cs diff --git a/mzLib/MzLibUtil/MzLibUtil.csproj b/mzLib/MzLibUtil/MzLibUtil.csproj index c6b5cf526..ae8fef5ea 100644 --- a/mzLib/MzLibUtil/MzLibUtil.csproj +++ b/mzLib/MzLibUtil/MzLibUtil.csproj @@ -14,6 +14,7 @@ + diff --git a/mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings b/mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings new file mode 100644 index 000000000..52f9c2892 --- /dev/null +++ b/mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings @@ -0,0 +1,2 @@ + + True \ No newline at end of file diff --git a/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs b/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs new file mode 100644 index 000000000..ca2f2f712 --- /dev/null +++ b/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs @@ -0,0 +1,58 @@ +using System; +using System.Collections.Generic; +using Microsoft.Extensions.ObjectPool; + +namespace MzLibUtil; + +// Used to pool HashSet instances to reduce memory allocations +public class DictionaryPool where TKey : notnull +{ + private readonly ObjectPool> _pool; + + /// + /// Initializes a new instance of the class. + /// + /// Initial capacity for the pooled Dictionary instances. 
+ public DictionaryPool(int initialCapacity = 16) + { + var policy = new DictionaryPooledObjectPolicy(initialCapacity); + var provider = new DefaultObjectPoolProvider { MaximumRetained = Environment.ProcessorCount * 2 }; + _pool = provider.Create(policy); + } + + /// + /// Retrieves a Dictionary instance from the pool. + /// + /// A Dictionary instance. + public Dictionary Get() => _pool.Get(); + + /// + /// Returns a Dictionary instance back to the pool. + /// + /// The Dictionary instance to return. + public void Return(Dictionary dictionary) + { + if (dictionary == null) throw new ArgumentNullException(nameof(dictionary)); + dictionary.Clear(); // Ensure the Dictionary is clean before returning it to the pool + _pool.Return(dictionary); + } + + private class DictionaryPooledObjectPolicy(int initialCapacity) + : PooledObjectPolicy> + where TKeyItem : notnull + { + private int InitialCapacity { get; } = initialCapacity; + + public override Dictionary Create() + { + return new Dictionary(capacity: InitialCapacity); + } + + public override bool Return(Dictionary obj) + { + // Ensure the Dictionary can be safely reused + obj.Clear(); + return true; + } + } +} \ No newline at end of file diff --git a/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs b/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs new file mode 100644 index 000000000..ef3e152d1 --- /dev/null +++ b/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs @@ -0,0 +1,60 @@ +using System; +using System.Collections.Generic; +using Microsoft.Extensions.ObjectPool; + +namespace MzLibUtil; + +// Example Usage: +// var pool = new HashSetPool(); +// var hashSet = pool.Get(); +// hashSet.Add(1); +// Do Work +// pool.Return(hashSet); + +// Used to pool HashSet instances to reduce memory allocations +public class HashSetPool +{ + private readonly ObjectPool> _pool; + + /// + /// Initializes a new instance of the class. + /// + /// Initial capacity for the pooled HashSet instances. 
+ public HashSetPool(int initialCapacity = 16) + { + var policy = new HashSetPooledObjectPolicy(initialCapacity); + _pool = new DefaultObjectPool>(policy); + } + + /// + /// Retrieves a HashSet instance from the pool. + /// + /// A HashSet instance. + public HashSet Get() => _pool.Get(); + + /// + /// Returns a HashSet instance back to the pool. + /// + /// The HashSet instance to return. + public void Return(HashSet hashSet) + { + if (hashSet == null) throw new ArgumentNullException(nameof(hashSet)); + hashSet.Clear(); // Ensure the HashSet is clean before returning it to the pool + _pool.Return(hashSet); + } + + private class HashSetPooledObjectPolicy(int initialCapacity) : PooledObjectPolicy> + { + public override HashSet Create() + { + return new HashSet(capacity: initialCapacity); + } + + public override bool Return(HashSet obj) + { + // Ensure the HashSet can be safely reused + obj.Clear(); + return true; + } + } +} \ No newline at end of file diff --git a/mzLib/MzLibUtil/ObjectPools/ListPool.cs b/mzLib/MzLibUtil/ObjectPools/ListPool.cs new file mode 100644 index 000000000..310369b90 --- /dev/null +++ b/mzLib/MzLibUtil/ObjectPools/ListPool.cs @@ -0,0 +1,56 @@ +using System; +using System.Collections.Generic; +using Microsoft.Extensions.ObjectPool; + +namespace MzLibUtil; + +// Used to pool HashSet instances to reduce memory allocations +public class ListPool +{ + private readonly ObjectPool> _pool; + + /// + /// Initializes a new instance of the class. + /// + /// Initial capacity for the pooled HashSet instances. + public ListPool(int initialCapacity = 16) + { + var policy = new ListPooledObjectPolicy(initialCapacity); + var provider = new DefaultObjectPoolProvider { MaximumRetained = Environment.ProcessorCount * 2 }; + _pool = provider.Create(policy); + } + + /// + /// Retrieves a HashSet instance from the pool. + /// + /// A HashSet instance. + public List Get() => _pool.Get(); + + /// + /// Returns a HashSet instance back to the pool. 
+ /// + /// The HashSet instance to return. + public void Return(List list) + { + if (list == null) throw new ArgumentNullException(nameof(list)); + list.Clear(); // Ensure the HashSet is clean before returning it to the pool + _pool.Return(list); + } + + private class ListPooledObjectPolicy(int initialCapacity) : PooledObjectPolicy> + { + private int InitialCapacity { get; } = initialCapacity; + + public override List Create() + { + return new List(capacity: InitialCapacity); + } + + public override bool Return(List obj) + { + // Ensure the HashSet can be safely reused + obj.Clear(); + return true; + } + } +} \ No newline at end of file diff --git a/mzLib/Test/ObjectPoolTests.cs b/mzLib/Test/ObjectPoolTests.cs new file mode 100644 index 000000000..bac0600eb --- /dev/null +++ b/mzLib/Test/ObjectPoolTests.cs @@ -0,0 +1,120 @@ +using MzLibUtil; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Test; + +[TestFixture] +[ExcludeFromCodeCoverage] +public class HashSetPoolTests +{ + [Test] + public void Get_ReturnsHashSetInstance() + { + var pool = new HashSetPool(); + var hashSet = pool.Get(); + Assert.That(hashSet, Is.Not.Null); + pool.Return(hashSet); + } + + [Test] + public void Return_ClearsHashSetBeforeReturningToPool() + { + var pool = new HashSetPool(); + var hashSet = pool.Get(); + hashSet.Add(1); + pool.Return(hashSet); + Assert.That(hashSet.Count, Is.EqualTo(0)); + } + + [Test] + public void Return_ThrowsArgumentNullException_WhenHashSetIsNull() + { + var pool = new HashSetPool(); + Assert.Throws(() => pool.Return(null)); + } +} + +[TestFixture] +[ExcludeFromCodeCoverage] +public class DictionaryPoolTests +{ + [Test] + public void Get_ReturnsDictionaryInstance() + { + var dictionaryPool = new DictionaryPool(); + var dictionary = dictionaryPool.Get(); + Assert.That(dictionary, Is.Not.Null); + Assert.That(dictionary, Is.InstanceOf>()); + } + + [Test] + public void 
Return_ClearsAndReturnsDictionaryToPool() + { + var dictionaryPool = new DictionaryPool(); + var dictionary = dictionaryPool.Get(); + dictionary["key"] = 42; + + dictionaryPool.Return(dictionary); + + Assert.That(dictionary.Count, Is.EqualTo(0)); + } + + [Test] + public void Return_ThrowsArgumentNullException_WhenDictionaryIsNull() + { + var dictionaryPool = new DictionaryPool(); + Assert.Throws(() => dictionaryPool.Return(null)); + } +} + +[TestFixture] +[ExcludeFromCodeCoverage] +public class ListPoolTests +{ + [Test] + public void ListPool_Get_ReturnsListWithInitialCapacity() + { + // Arrange + int initialCapacity = 16; + var listPool = new ListPool(initialCapacity); + + // Act + var list = listPool.Get(); + + // Assert + Assert.That(list, Is.Not.Null); + Assert.That(list.Capacity, Is.EqualTo(initialCapacity)); + } + + [Test] + public void ListPool_Return_ClearsListBeforeReturningToPool() + { + // Arrange + var listPool = new ListPool(); + var list = listPool.Get(); + list.Add(1); + list.Add(2); + + // Act + listPool.Return(list); + var returnedList = listPool.Get(); + + // Assert + Assert.That(returnedList, Is.Not.Null); + Assert.That(returnedList, Is.Empty); + } + + [Test] + public void ListPool_Return_ThrowsArgumentNullException_WhenListIsNull() + { + // Arrange + var listPool = new ListPool(); + + // Act & Assert + Assert.That(() => listPool.Return(null), Throws.ArgumentNullException); + } +} + diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 3aa393afe..8cee705ec 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -23,6 +23,7 @@ + @@ -36,6 +37,7 @@ + From be82276dd0c7a04fa4b10febdc8c677f50a3f37e Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 18:42:33 -0600 Subject: [PATCH 02/21] Refactor DigestionAgent to use HashSetPool for indices Added `using` directive for `MzLibUtil`. Introduced a static readonly `HashSetPool` named `HashSetPool` to manage a pool of hash sets. 
`HashSetPool` is initialized by its static field initializer; the `DigestionAgent` constructor itself is unchanged. Refactored `GetDigestionSiteIndices` to use a hash set from `HashSetPool` for storing indices, ensuring no duplicates. Explicitly added start and end of protein sequence as cleavage sites. Implemented `try-finally` block to return hash set to pool after use. The final list of indices is built with `indices.ToList()` inside the `finally` block; it is not explicitly sorted, because the previous `Distinct().OrderBy(...)` call was removed, so its order follows the hash set's enumeration order. --- mzLib/Omics/Digestion/DigestionAgent.cs | 57 +++++++++++++++---------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionAgent.cs b/mzLib/Omics/Digestion/DigestionAgent.cs index 88794de06..9d9d6083b 100644 --- a/mzLib/Omics/Digestion/DigestionAgent.cs +++ b/mzLib/Omics/Digestion/DigestionAgent.cs @@ -1,9 +1,12 @@ -using Omics.Modifications; +using MzLibUtil; +using Omics.Modifications; namespace Omics.Digestion { public abstract class DigestionAgent { + protected static readonly HashSetPool HashSetPool = new HashSetPool(8); + protected DigestionAgent(string name, CleavageSpecificity cleavageSpecificity, List motifList, Modification cleavageMod) { Name = name; @@ -73,40 +76,50 @@ protected static bool ValidMaxLength(int? length, int maxLength) /// public List GetDigestionSiteIndices(string sequence) { - var indices = new List(); + List? 
indicesList; + var indices = HashSetPool.Get(); // use hash set to ensure no duplicates - for (int r = 0; r < sequence.Length; r++) + try { - var cutSiteIndex = -1; - bool cleavagePrevented = false; + indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide - foreach (DigestionMotif motif in DigestionMotifs) + for (int r = 0; r < sequence.Length; r++) { - var motifResults = motif.Fits(sequence, r); - bool motifFits = motifResults.Item1; - bool motifPreventsCleavage = motifResults.Item2; + var cutSiteIndex = -1; + bool cleavagePrevented = false; - if (motifFits && r + motif.CutIndex < sequence.Length) + foreach (DigestionMotif motif in DigestionMotifs) { - cutSiteIndex = Math.Max(r + motif.CutIndex, cutSiteIndex); + var motifResults = motif.Fits(sequence, r); + bool motifFits = motifResults.Item1; + bool motifPreventsCleavage = motifResults.Item2; + + if (motifFits && r + motif.CutIndex < sequence.Length) + { + cutSiteIndex = Math.Max(r + motif.CutIndex, cutSiteIndex); + } + + if (motifPreventsCleavage) // if any motif prevents cleave + { + cleavagePrevented = true; + } } - if (motifPreventsCleavage) // if any motif prevents cleave + // if no motif prevents cleave + if (!cleavagePrevented && cutSiteIndex != -1) { - cleavagePrevented = true; + indices.Add(cutSiteIndex); } } - // if no motif prevents cleave - if (!cleavagePrevented && cutSiteIndex != -1) - { - indices.Add(cutSiteIndex); - } + indices.Add(sequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide } - - indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide - indices.Add(sequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide - return indices.Distinct().OrderBy(i => i).ToList(); + finally + { + indicesList = indices.ToList(); + HashSetPool.Return(indices); + } + return indicesList; } } } From 
2bfde710f661c69ff0576876e47adc19f0c36aea Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 18:48:37 -0600 Subject: [PATCH 03/21] string interpolation in BPWSM extensions --- .../Omics/BioPolymerWithSetModsExtensions.cs | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs index 2e5d29718..20d0e7abe 100644 --- a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs +++ b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs @@ -18,9 +18,9 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS var subsequence = new StringBuilder(); // modification on peptide N-terminus - if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) + if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification? mod)) { - subsequence.Append('[' + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } for (int r = 0; r < withSetMods.Length; r++) @@ -32,11 +32,11 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS { if (mod.MonoisotopicMass > 0) { - subsequence.Append("[+" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[+{mod.MonoisotopicMass.RoundedDouble(6)}]"); } else { - subsequence.Append("[" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } } } @@ -46,11 +46,11 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS { if (mod.MonoisotopicMass > 0) { - subsequence.Append("[+" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[+{mod.MonoisotopicMass.RoundedDouble(6)}]"); } else { - subsequence.Append("[" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } } 
return subsequence.ToString(); @@ -68,14 +68,15 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, string essentialSequence = withSetMods.BaseSequence; if (modstoWritePruned != null) { - var sbsequence = new StringBuilder(); + var sbsequence = new StringBuilder(withSetMods.FullSequence.Length); // variable modification on peptide N-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification pep_n_term_variable_mod)) { if (modstoWritePruned.ContainsKey(pep_n_term_variable_mod.ModificationType)) { - sbsequence.Append('[' + pep_n_term_variable_mod.ModificationType + ":" + pep_n_term_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{pep_n_term_variable_mod.ModificationType}:{pep_n_term_variable_mod.IdWithMotif}]"); } } for (int r = 0; r < withSetMods.Length; r++) @@ -86,7 +87,8 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, { if (modstoWritePruned.ContainsKey(residue_variable_mod.ModificationType)) { - sbsequence.Append('[' + residue_variable_mod.ModificationType + ":" + residue_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{residue_variable_mod.ModificationType}:{residue_variable_mod.IdWithMotif}]"); } } } @@ -96,7 +98,8 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, { if (modstoWritePruned.ContainsKey(pep_c_term_variable_mod.ModificationType)) { - sbsequence.Append('[' + pep_c_term_variable_mod.ModificationType + ":" + pep_c_term_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{pep_c_term_variable_mod.ModificationType}:{pep_c_term_variable_mod.IdWithMotif}]"); } } @@ -112,12 +115,13 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, /// public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMods) { - var subSequence = new StringBuilder(); + // start string builder with initial capacity to avoid resizing costs. 
+ var subSequence = new StringBuilder(withSetMods.BaseSequence.Length + withSetMods.AllModsOneIsNterminus.Count * 30); // modification on peptide N-terminus - if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) + if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification? mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } for (int r = 0; r < withSetMods.Length; r++) @@ -127,14 +131,14 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo // modification on this residue if (withSetMods.AllModsOneIsNterminus.TryGetValue(r + 2, out mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } } // modification on peptide C-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } return subSequence.ToString(); From 11ccba97360a64d0137efa2aaf0ccb3cc3fffa1a Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 19:27:10 -0600 Subject: [PATCH 04/21] Adjusted IEnumerable return in Protease.GetUnmodified --- .../ProteolyticDigestion/Protease.cs | 110 +++++++++--------- 1 file changed, 54 insertions(+), 56 deletions(-) diff --git a/mzLib/Proteomics/ProteolyticDigestion/Protease.cs b/mzLib/Proteomics/ProteolyticDigestion/Protease.cs index 3274e8884..9ad375330 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/Protease.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/Protease.cs @@ -76,69 +76,27 @@ public CleavageSpecificity GetCleavageSpecificity(Protein protein, int startInde /// /// /// - internal List GetUnmodifiedPeptides(Protein protein, int maximumMissedCleavages, InitiatorMethionineBehavior 
initiatorMethionineBehavior, + internal IEnumerable GetUnmodifiedPeptides(Protein protein, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, int minPeptideLength, int maxPeptideLength, Protease specificProtease, bool topDownTruncationSearch = false) { - List peptides = new List(); - - // proteolytic cleavage in one spot (N) - if (CleavageSpecificity == CleavageSpecificity.SingleN) + return CleavageSpecificity switch { - peptides = SingleN_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); - } + // proteolytic cleavage in one spot (N) + CleavageSpecificity.SingleN => SingleN_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease), - // proteolytic cleavage in one spot (C) - else if (CleavageSpecificity == CleavageSpecificity.SingleC) - { - peptides = SingleC_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); - } + // proteolytic cleavage in one spot (C) + CleavageSpecificity.SingleC => SingleC_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease), - //top-down - else if (CleavageSpecificity == CleavageSpecificity.None) - { - if (!topDownTruncationSearch)//standard top-down - { - // retain methionine - if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave || protein[0] != 'M') - && ValidLength(protein.Length, minPeptideLength, maxPeptideLength)) - { - peptides.Add(new ProteolyticPeptide(protein, 1, protein.Length, 0, CleavageSpecificity.Full, "full")); - } + //top-down + CleavageSpecificity.None => TopDownDigestion(protein, initiatorMethionineBehavior, minPeptideLength, maxPeptideLength, topDownTruncationSearch), - // cleave methionine - if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain && protein[0] == 
'M') - && ValidLength(protein.Length - 1, minPeptideLength, maxPeptideLength)) - { - peptides.Add(new ProteolyticPeptide(protein, 2, protein.Length, 0, CleavageSpecificity.Full, "full:M cleaved")); - } - } + // Full proteolytic cleavage + CleavageSpecificity.Full => FullDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength), - // Also digest using the proteolysis product start/end indices - peptides.AddRange( - protein.ProteolysisProducts - .Where(proteolysisProduct => proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue - && ValidLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, maxPeptideLength)) - .Select(proteolysisProduct => - new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, 0, CleavageSpecificity.None, proteolysisProduct.Type))); - } - - // Full proteolytic cleavage - else if (CleavageSpecificity == CleavageSpecificity.Full) - { - peptides.AddRange(FullDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); - } - - // Cleavage rules for semi-specific search - else if (CleavageSpecificity == CleavageSpecificity.Semi) - { - peptides.AddRange(SemiProteolyticDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); - } - else - { - throw new NotImplementedException(); - } - - return peptides; + // Cleavage rules for semi-specific search + CleavageSpecificity.Semi => SemiProteolyticDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength), + _ => throw new NotImplementedException() + }; } /// @@ -281,6 +239,46 @@ private IEnumerable FullDigestion(Protein protein, Initiator } } + /// + /// Gets protein intervals for top-down digestion. 
+ /// + /// + /// + /// + /// + /// + /// + private IEnumerable TopDownDigestion(Protein protein, InitiatorMethionineBehavior initiatorMethionineBehavior, + int minPeptideLength, int maxPeptideLength, bool topDownTruncationSearch) + { + if (!topDownTruncationSearch) // standard top-down + { + // retain methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave || protein[0] != 'M') + && ValidLength(protein.Length, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, 1, protein.Length, 0, CleavageSpecificity.Full, "full"); + } + + // cleave methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain && protein[0] == 'M') + && ValidLength(protein.Length - 1, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, 2, protein.Length, 0, CleavageSpecificity.Full, "full:M cleaved"); + } + } + + // Also digest using the proteolysis product start/end indices + foreach (var proteolysisProduct in protein.ProteolysisProducts) + { + if (proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue + && ValidLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, 0, CleavageSpecificity.None, proteolysisProduct.Type); + } + } + } + /// /// Gets the protein intervals based on semiSpecific digestion rules /// This is the classic, slow semi-specific digestion that generates each semi-specific peptide pre-search From 72c9de6d36a288fb5b1976171a95c74d16a9889c Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 20:48:16 -0600 Subject: [PATCH 05/21] Digestion Optimizations --- mzLib/Omics/Digestion/DigestionProduct.cs | 58 +++---- .../Modifications/ModificationLocalization.cs | 30 +++- 
.../ProteolyticPeptide.cs | 164 +++++++++--------- 3 files changed, 129 insertions(+), 123 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index 55aed3255..9a5862497 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -1,10 +1,4 @@ -using System; -using System.Collections.Generic; -using System.ComponentModel; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Omics.Modifications; +using Omics.Modifications; namespace Omics.Digestion { @@ -49,16 +43,17 @@ protected static IEnumerable> GetVariableModificat } else { - var possible_variable_modifications = new Dictionary>(possibleVariableModifications); + var possibleVariableModificationsCopy = new Dictionary>(possibleVariableModifications); + int[] baseVariableModificationPattern = new int[peptideLength + 4]; + int totalAvailableMods = possibleVariableModificationsCopy.Values.Sum(modList => modList?.Count ?? 0); + int maxVariableMods = Math.Min(totalAvailableMods, maxModsForPeptide); - int[] base_variable_modification_pattern = new int[peptideLength + 4]; - var totalAvailableMods = possible_variable_modifications.Sum(b => b.Value == null ? 
0 : b.Value.Count); - for (int variable_modifications = 0; variable_modifications <= Math.Min(totalAvailableMods, maxModsForPeptide); variable_modifications++) + for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) { - foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(new List>>(possible_variable_modifications), - possible_variable_modifications.Count - variable_modifications, base_variable_modification_pattern, 0)) + foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModificationsCopy.ToList(), + possibleVariableModificationsCopy.Count - variable_modifications, baseVariableModificationPattern, 0)) { - yield return GetNewVariableModificationPattern(variable_modification_pattern, possible_variable_modifications); + yield return GetNewVariableModificationPattern(variable_modification_pattern, possibleVariableModificationsCopy); } } } @@ -77,17 +72,15 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "N-terminal.": case "Peptide N-terminal.": //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginign of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) { - if (OneBasedStartResidue != 1) + if (mod.ModificationType == "Protease") { - fixedModsOneIsNterminus[2] = mod; + if (OneBasedStartResidue != 1) + fixedModsOneIsNterminus[2] = mod; } - } - //Normal N-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) - { - fixedModsOneIsNterminus[1] = mod; + else //Normal N-terminal peptide modification + fixedModsOneIsNterminus[1] = mod; } break; @@ -106,17 +99,15 @@ protected Dictionary 
GetFixedModsOneIsNorFivePrimeTerminus(in case "C-terminal.": case "Peptide C-terminal.": //the modification is protease associated and is applied to the c-terminal cleaved residue, not if it is at the end of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) { - if (OneBasedEndResidue != Parent.Length) + if (mod.ModificationType == "Protease") { - fixedModsOneIsNterminus[length + 1] = mod; + if (OneBasedEndResidue != Parent.Length) + fixedModsOneIsNterminus[length + 1] = mod; } - } - //Normal C-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) - { - fixedModsOneIsNterminus[length + 2] = mod; + else //Normal C-terminal peptide modification + fixedModsOneIsNterminus[length + 2] = mod; } break; @@ -127,7 +118,6 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in return fixedModsOneIsNterminus; } - private static IEnumerable GetVariableModificationPatterns(List>> possibleVariableModifications, int unmodifiedResiduesDesired, int[] variableModificationPattern, int index) { @@ -174,9 +164,9 @@ private static IEnumerable GetVariableModificationPatterns(List GetNewVariableModificationPattern(int[] variableModificationArray, - IEnumerable>> possibleVariableModifications) + Dictionary> possibleVariableModifications) { - var modification_pattern = new Dictionary(); + var modification_pattern = new Dictionary(possibleVariableModifications.Count); foreach (KeyValuePair> kvp in possibleVariableModifications) { @@ -188,7 +178,5 @@ private static Dictionary GetNewVariableModificationPattern(i return modification_pattern; } - - } } diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs 
b/mzLib/Omics/Modifications/ModificationLocalization.cs index bbf25d1a3..e2c57fa2d 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -2,19 +2,28 @@ { public static class ModificationLocalization { + // This method is called a ton (8.8 billion times in Bottom-Up Jenkins as of 1.0.6) in MetaMorpheus. If changes are made, ensure they are efficient. public static bool ModFits(Modification attemptToLocalize, string sequence, int digestionProductOneBasedIndex, int digestionProductLength, int bioPolymerOneBasedIndex) { // First find the capital letter... - var motif = attemptToLocalize.Target; - var motifStartLocation = motif.ToString().IndexOf(motif.ToString().First(b => char.IsUpper(b))); + var motif = attemptToLocalize.Target.ToString(); + var motifStartLocation = -1; + for (int i = 0; i < motif.Length; i++) + { + if (!char.IsUpper(motif[i])) + continue; + + motifStartLocation = i; + break; + } // Look up starting at and including the capital letter var proteinToMotifOffset = bioPolymerOneBasedIndex - motifStartLocation - 1; var indexUp = 0; - while (indexUp < motif.ToString().Length) + while (indexUp < motif.Length) { if (indexUp + proteinToMotifOffset < 0 || indexUp + proteinToMotifOffset >= sequence.Length - || !MotifMatches(motif.ToString()[indexUp], sequence[indexUp + proteinToMotifOffset])) + || !MotifMatches(motif[indexUp], sequence[indexUp + proteinToMotifOffset])) { return false; } @@ -56,11 +65,14 @@ public static bool UniprotModExists(IBioPolymer bioPolymer, int i, Modification private static bool MotifMatches(char motifChar, char sequenceChar) { char upperMotifChar = char.ToUpper(motifChar); - return upperMotifChar.Equals('X') - || upperMotifChar.Equals(sequenceChar) - || upperMotifChar.Equals('B') && new[] { 'D', 'N' }.Contains(sequenceChar) - || upperMotifChar.Equals('J') && new[] { 'I', 'L' }.Contains(sequenceChar) - || upperMotifChar.Equals('Z') && new[] { 'E', 'Q' 
}.Contains(sequenceChar); + return upperMotifChar switch + { + 'X' => true, + 'B' => sequenceChar is 'D' or 'N', + 'J' => sequenceChar is 'I' or 'L', + 'Z' => sequenceChar is 'E' or 'Q', + _ => upperMotifChar == sequenceChar + }; } } } \ No newline at end of file diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index 954ce449a..ee7b8d112 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; -using System.Linq; -using System.Security.Cryptography; +using MzLibUtil; using Omics.Digestion; using Omics.Modifications; @@ -14,7 +13,7 @@ namespace Proteomics.ProteolyticDigestion [Serializable] public class ProteolyticPeptide : DigestionProduct { - + private static readonly DictionaryPool> DictionaryPool = new(8); internal ProteolyticPeptide(Protein protein, int oneBasedStartResidueInProtein, int oneBasedEndResidueInProtein, int missedCleavages, CleavageSpecificity cleavageSpecificityForFdrCategory, string peptideDescription = null, string baseSequence = null) : base(protein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificityForFdrCategory, peptideDescription, baseSequence) { @@ -51,76 +50,34 @@ public string PeptideDescription /// /// /// - internal IEnumerable GetModifiedPeptides(IEnumerable allKnownFixedModifications, + internal IEnumerable GetModifiedPeptides(List allKnownFixedModifications, DigestionParams digestionParams, List variableModifications) { int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; int maxModsForPeptide = digestionParams.MaxModsForPeptide; - var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(peptideLength + 4); - - var pepNTermVariableMods = new 
List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(1, pepNTermVariableMods); + var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get(); - var pepCTermVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(peptideLength + 2, pepCTermVariableMods); - - foreach (Modification variableModification in variableModifications) + try { - // Check if can be a n-term mod - if (CanBeNTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, 1, variableModification)) - { - pepNTermVariableMods.Add(variableModification); - } + var pepNTermVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(1, pepNTermVariableMods); + + var pepCTermVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(peptideLength + 2, pepCTermVariableMods); - for (int r = 0; r < peptideLength; r++) + foreach (Modification variableModification in variableModifications) { - if (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) - && variableModification.LocationRestriction == "Anywhere." 
&& !ModificationLocalization.UniprotModExists(Protein, r + 1, variableModification)) + // Check if can be a n-term mod + if (CanBeNTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, 1, variableModification)) { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } + pepNTermVariableMods.Add(variableModification); } - } - // Check if can be a c-term mod - if (CanBeCTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, peptideLength, variableModification)) - { - pepCTermVariableMods.Add(variableModification); - } - } - - // LOCALIZED MODS - foreach (var kvp in Protein.OneBasedPossibleLocalizedModifications) - { - bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; - if (!inBounds) - { - continue; - } - int locInPeptide = kvp.Key - OneBasedStartResidueInProtein + 1; - foreach (Modification modWithMass in kvp.Value) - { - if (modWithMass is Modification variableModification) + for (int r = 0; r < peptideLength; r++) { - // Check if can be a n-term mod - if (locInPeptide == 1 && CanBeNTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) - { - pepNTermVariableMods.Add(variableModification); - } - - int r = locInPeptide - 1; - if (r >= 0 && r < peptideLength - && (Protein.IsDecoy || - (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidueInProtein + r) - && variableModification.LocationRestriction == "Anywhere."))) + if (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == 
"Anywhere." && !ModificationLocalization.UniprotModExists(Protein, r + 1, variableModification)) { if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) { @@ -132,36 +89,85 @@ internal IEnumerable GetModifiedPeptides(IEnumerabl residueVariableMods.Add(variableModification); } } + } + // Check if can be a c-term mod + if (CanBeCTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, peptideLength, variableModification)) + { + pepCTermVariableMods.Add(variableModification); + } + } + + // LOCALIZED MODS + foreach (var kvp in Protein.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; + if (!inBounds) + { + continue; + } - // Check if can be a c-term mod - if (locInPeptide == peptideLength && CanBeCTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) + int locInPeptide = kvp.Key - OneBasedStartResidueInProtein + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is Modification variableModification) { - pepCTermVariableMods.Add(variableModification); + // Check if can be a n-term mod + if (locInPeptide == 1 && CanBeNTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) + { + pepNTermVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < peptideLength + && (Protein.IsDecoy || + (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidueInProtein + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } 
+ } + + // Check if can be a c-term mod + if (locInPeptide == peptideLength && CanBeCTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) + { + pepCTermVariableMods.Add(variableModification); + } } } } - } - int variable_modification_isoforms = 0; + int variable_modification_isoforms = 0; - foreach (Dictionary kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) - { - int numFixedMods = 0; - foreach (var ok in GetFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications)) + foreach (Dictionary kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) { - if (!kvp.ContainsKey(ok.Key)) + int numFixedMods = 0; + foreach (var ok in GetFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications)) + { + if (!kvp.ContainsKey(ok.Key)) + { + numFixedMods++; + kvp.Add(ok.Key, ok.Value); + } + } + yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidue, OneBasedEndResidue, + CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, kvp, numFixedMods); + variable_modification_isoforms++; + if (variable_modification_isoforms == maximumVariableModificationIsoforms) { - numFixedMods++; - kvp.Add(ok.Key, ok.Value); + yield break; } } - yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidue, OneBasedEndResidue, - CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, kvp, numFixedMods); - variable_modification_isoforms++; - if (variable_modification_isoforms == maximumVariableModificationIsoforms) - { - yield break; - } + } + finally + { + DictionaryPool.Return(twoBasedPossibleVariableAndLocalizeableModifications); } } From d68e40f7e5ea78f300b6ba5159a9923e22db6a99 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 20:48:30 -0600 Subject: [PATCH 06/21] Moved testing class to 
proper subdirectory --- .../TestMsFraggerCombinedResults.cs | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) rename mzLib/Test/{ => FileReadingTests}/TestMsFraggerCombinedResults.cs (83%) diff --git a/mzLib/Test/TestMsFraggerCombinedResults.cs b/mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs similarity index 83% rename from mzLib/Test/TestMsFraggerCombinedResults.cs rename to mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs index bd5d8834d..731284adb 100644 --- a/mzLib/Test/TestMsFraggerCombinedResults.cs +++ b/mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs @@ -1,15 +1,12 @@ using NUnit.Framework; using Readers; -using System; using System.Collections.Generic; using System.Linq; -using Assert = NUnit.Framework.Legacy.ClassicAssert; using System.IO; using TopDownProteomics; -using OxyPlot; using System.Diagnostics.CodeAnalysis; -namespace Test +namespace Test.FileReadingTests { [ExcludeFromCodeCoverage] internal class TestMsFraggerCombinedResults @@ -22,8 +19,8 @@ public void TestLoadResultsCount(string path) MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); ms.LoadResults(); - Assert.That(ms.AllPsmFiles.Count.Equals(2)); - Assert.That(ms.Results.Count.Equals(8)); + NUnit.Framework.Assert.That(ms.AllPsmFiles.Count.Equals(2)); + NUnit.Framework.Assert.That(ms.Results.Count.Equals(8)); } [Test] @@ -36,8 +33,8 @@ public void TestLoadResults(string path) List results = ms.Results.Select(psm => psm.FileName).ToList(); - Assert.That((results.Count(s => s.Contains("A_1"))).Equals(4)); - Assert.That((results.Count(s => s.Contains("A_2"))).Equals(4)); + NUnit.Framework.Assert.That(results.Count(s => s.Contains("A_1")).Equals(4)); + NUnit.Framework.Assert.That(results.Count(s => s.Contains("A_2")).Equals(4)); } [Test] @@ -61,8 +58,8 @@ public void TestFileNameToFilePathWithParameter(string path) foreach (var fileName in results) { - Assert.That(allFiles.TryGetValue(fileName, out var output)); - 
Assert.That(filePaths.Contains(output)); + NUnit.Framework.Assert.That(allFiles.TryGetValue(fileName, out var output)); + NUnit.Framework.Assert.That(filePaths.Contains(output)); } } @@ -80,8 +77,8 @@ public void TestFileNameToFilePathWithoutParameter(string path) foreach (var fileName in results) { - Assert.That(allFiles.TryGetValue(fileName, out var output)); - Assert.That(filePaths.Contains(output)); + NUnit.Framework.Assert.That(allFiles.TryGetValue(fileName, out var output)); + NUnit.Framework.Assert.That(filePaths.Contains(output)); } } @@ -101,7 +98,7 @@ public void TestExperimentAnnotationFile(string path) ExperimentAnnotationFile experimentAnnotation = FileReader.ReadFile(fileToRead); experimentAnnotation.WriteResults(fileToWrite); - Assert.That(File.Exists(fileToWrite)); + NUnit.Framework.Assert.That(File.Exists(fileToWrite)); File.Delete(fileToWrite); } From 585ffc54e4c7f05908cf569be4cd184d9321b202 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 21:12:05 -0600 Subject: [PATCH 07/21] Adjusted ModFits to have the correct localization for peptide and protein termini --- .../Modifications/ModificationLocalization.cs | 4 +-- mzLib/Test/TestProteinDigestion.cs | 35 +++++++++++++++++-- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs b/mzLib/Omics/Modifications/ModificationLocalization.cs index e2c57fa2d..8928d8c9a 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -32,9 +32,9 @@ public static bool ModFits(Modification attemptToLocalize, string sequence, int switch (attemptToLocalize.LocationRestriction) { case "N-terminal." when bioPolymerOneBasedIndex > 2: - case "Peptide N-terminal." when digestionProductOneBasedIndex > 1: + case "Peptide N-terminal." when digestionProductOneBasedIndex > 1 || bioPolymerOneBasedIndex == 1: case "C-terminal." 
when bioPolymerOneBasedIndex < sequence.Length: - case "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength: + case "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength || bioPolymerOneBasedIndex == sequence.Length: case "5'-terminal." when bioPolymerOneBasedIndex > 2: // first residue in oligo but not first in nucleic acid case "Oligo 5'-terminal." when digestionProductOneBasedIndex > 1 diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index 02cc3aed5..821288dae 100644 --- a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -220,7 +220,7 @@ public static void TestPeptideWithSetModifications() variableModifications.Add(new Modification(_originalId: "ProtCmod", _target: motif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); var ye = prot.Digest(digestionParams, new List(), variableModifications).ToList(); - Assert.AreEqual(3 * 2 * 3, ye.Count); + Assert.AreEqual(2 * 2 * 2, ye.Count); Assert.AreEqual("[H]M[H][H]", ye.Last().SequenceWithChemicalFormulas); double m1 = 5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass; @@ -251,12 +251,43 @@ public static void TestPeptideWithFixedModifications() Assert.AreEqual(1, ok.Count); - Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); Assert.AreEqual("[H]M[H][H]", ok.First().SequenceWithChemicalFormulas); Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); } + [Test] + public static void TestPeptideWithFixedModifications_TwoProducts() + { + var prot = new 
Protein("MKM", null); + DigestionParams digestionParams = new DigestionParams(maxMissedCleavages: 0, minPeptideLength: 1, maxModsForPeptides: 3, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + List fixedMods = new List(); + ModificationMotif.TryGetMotif("M", out ModificationMotif mMotif); + ModificationMotif.TryGetMotif("K", out ModificationMotif kMotif); + + fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: mMotif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: kMotif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "pepNmod", _target: mMotif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "pepNmod", _target: kMotif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "resMod", _target: mMotif, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "PepCmod", _target: mMotif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "PepCmod", _target: kMotif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: 
GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "ProtCmod", _target: mMotif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "ProtCmod", _target: kMotif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + + var ok = prot.Digest(digestionParams, fixedMods, new List()).ToList(); + + Assert.AreEqual(2, ok.Count); + + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K[:PepCmod on K]", ok.First().FullSequence); + Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.Skip(1).First().FullSequence); + + Assert.AreEqual("[H]M[H]K[H]", ok.First().SequenceWithChemicalFormulas); + Assert.AreEqual("[H]M[H][H]", ok.Skip(1).First().SequenceWithChemicalFormulas); + Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); + } + [Test] public static void TestDigestIndices() { From 513a12b0e654128ceb472373d440c28f72989940 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 21:29:33 -0600 Subject: [PATCH 08/21] Cleaned up hashset return --- mzLib/Omics/Digestion/DigestionAgent.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionAgent.cs b/mzLib/Omics/Digestion/DigestionAgent.cs index 9d9d6083b..d2f3b37db 100644 --- a/mzLib/Omics/Digestion/DigestionAgent.cs +++ b/mzLib/Omics/Digestion/DigestionAgent.cs @@ -76,9 +76,7 @@ protected static bool ValidMaxLength(int? length, int maxLength) /// public List GetDigestionSiteIndices(string sequence) { - List? 
indicesList; var indices = HashSetPool.Get(); // use hash set to ensure no duplicates - try { indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide @@ -113,13 +111,12 @@ public List GetDigestionSiteIndices(string sequence) } indices.Add(sequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide + return indices.ToList(); } finally { - indicesList = indices.ToList(); HashSetPool.Return(indices); } - return indicesList; } } } From b13e9402c699e77fba668d05ffe60ebaff37d85b Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 21:31:01 -0600 Subject: [PATCH 09/21] Digestion Agent Hashset Return Cleanup --- mzLib/Omics/Digestion/DigestionAgent.cs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionAgent.cs b/mzLib/Omics/Digestion/DigestionAgent.cs index 9d9d6083b..d2f3b37db 100644 --- a/mzLib/Omics/Digestion/DigestionAgent.cs +++ b/mzLib/Omics/Digestion/DigestionAgent.cs @@ -76,9 +76,7 @@ protected static bool ValidMaxLength(int? length, int maxLength) /// public List GetDigestionSiteIndices(string sequence) { - List? 
indicesList; var indices = HashSetPool.Get(); // use hash set to ensure no duplicates - try { indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide @@ -113,13 +111,12 @@ public List GetDigestionSiteIndices(string sequence) } indices.Add(sequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide + return indices.ToList(); } finally { - indicesList = indices.ToList(); HashSetPool.Return(indices); } - return indicesList; } } } From db610b25559871c945d8e954e96dd0f9744e1484 Mon Sep 17 00:00:00 2001 From: nbollis Date: Mon, 13 Jan 2025 22:32:57 -0600 Subject: [PATCH 10/21] set fixed mods now modifies in place using a pooled dictionary --- mzLib/Omics/Digestion/DigestionProduct.cs | 23 +-- .../ProteolyticPeptide.cs | 7 +- .../Digestion/NucleolyticOligo.cs | 166 ++++++++++-------- 3 files changed, 104 insertions(+), 92 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index 9a5862497..fde32cf96 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -1,9 +1,13 @@ -using Omics.Modifications; +using MzLibUtil; +using Omics.Modifications; namespace Omics.Digestion { public abstract class DigestionProduct { + protected static readonly DictionaryPool> DictionaryPool = new(); + protected static readonly DictionaryPool FixedModDictionaryPool = new(8); + protected string _baseSequence; protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int oneBasedEndResidue, int missedCleavages, @@ -43,26 +47,24 @@ protected static IEnumerable> GetVariableModificat } else { - var possibleVariableModificationsCopy = new Dictionary>(possibleVariableModifications); int[] baseVariableModificationPattern = new int[peptideLength + 4]; - int totalAvailableMods = possibleVariableModificationsCopy.Values.Sum(modList => modList?.Count ?? 
0); + int totalAvailableMods = possibleVariableModifications.Values.Sum(modList => modList?.Count ?? 0); int maxVariableMods = Math.Min(totalAvailableMods, maxModsForPeptide); for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) { - foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModificationsCopy.ToList(), - possibleVariableModificationsCopy.Count - variable_modifications, baseVariableModificationPattern, 0)) + foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications.ToList(), + possibleVariableModifications.Count - variable_modifications, baseVariableModificationPattern, 0)) { - yield return GetNewVariableModificationPattern(variable_modification_pattern, possibleVariableModificationsCopy); + yield return GetNewVariableModificationPattern(variable_modification_pattern, possibleVariableModifications); } } } } - protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(int length, - IEnumerable allKnownFixedModifications) + protected void SetFixedModsOneIsNorFivePrimeTerminus(int length, + IEnumerable allKnownFixedModifications, ref Dictionary fixedModsOneIsNterminus) { - var fixedModsOneIsNterminus = new Dictionary(length + 3); foreach (Modification mod in allKnownFixedModifications) { switch (mod.LocationRestriction) @@ -71,7 +73,7 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "Oligo 5'-terminal.": case "N-terminal.": case "Peptide N-terminal.": - //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginign of the protein + //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginning of the protein if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) { if (mod.ModificationType == "Protease") @@ -115,7 +117,6 @@ protected Dictionary 
GetFixedModsOneIsNorFivePrimeTerminus(in throw new NotSupportedException("This terminus localization is not supported."); } } - return fixedModsOneIsNterminus; } private static IEnumerable GetVariableModificationPatterns(List>> possibleVariableModifications, diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index ee7b8d112..5fb07fa0b 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -1,6 +1,5 @@ using System; using System.Collections.Generic; -using MzLibUtil; using Omics.Digestion; using Omics.Modifications; @@ -13,7 +12,6 @@ namespace Proteomics.ProteolyticDigestion [Serializable] public class ProteolyticPeptide : DigestionProduct { - private static readonly DictionaryPool> DictionaryPool = new(8); internal ProteolyticPeptide(Protein protein, int oneBasedStartResidueInProtein, int oneBasedEndResidueInProtein, int missedCleavages, CleavageSpecificity cleavageSpecificityForFdrCategory, string peptideDescription = null, string baseSequence = null) : base(protein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificityForFdrCategory, peptideDescription, baseSequence) { @@ -57,6 +55,7 @@ internal IEnumerable GetModifiedPeptides(List GetModifiedPeptides(List kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) { int numFixedMods = 0; - foreach (var ok in GetFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications)) + foreach (var ok in fixedModDictionary) { if (!kvp.ContainsKey(ok.Key)) { @@ -167,6 +167,7 @@ internal IEnumerable GetModifiedPeptides(List GenerateModifiedOligos(List int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; int maxModsForOligo = 
digestionParams.MaxMods; - var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(oligoLength + 4); + var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get(); + var fixedModDictionary = FixedModDictionaryPool.Get(); - var fivePrimeVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(1, fivePrimeVariableMods); - - var threePrimeVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(oligoLength + 2, threePrimeVariableMods); - - // collect all possible variable mods, skipping if there is a database annotated modification - foreach (Modification variableModification in variableModifications) + try { - // Check if can be a 5'-term mod - if (CanBeFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) - { - fivePrimeVariableMods.Add(variableModification); - } + var fivePrimeVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(1, fivePrimeVariableMods); + + var threePrimeVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(oligoLength + 2, threePrimeVariableMods); - for (int r = 0; r < oligoLength; r++) + // collect all possible variable mods, skipping if there is a database annotated modification + foreach (Modification variableModification in variableModifications) { - if (variableModification.LocationRestriction == "Anywhere." 
&& - ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) - && !ModificationLocalization.UniprotModExists(NucleicAcid, r + 1, variableModification)) + // Check if can be a 5'-term mod + if (CanBeFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } + fivePrimeVariableMods.Add(variableModification); } - } - // Check if can be a 3'-term mod - if (CanBeThreePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) - { - threePrimeVariableMods.Add(variableModification); - } - } - - // collect all localized modifications from the database. 
- foreach (var kvp in NucleicAcid.OneBasedPossibleLocalizedModifications) - { - bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; - if (!inBounds) - { - continue; - } - int locInPeptide = kvp.Key - OneBasedStartResidue + 1; - foreach (Modification modWithMass in kvp.Value) - { - if (modWithMass is Modification variableModification) + for (int r = 0; r < oligoLength; r++) { - // Check if can be a 5'-term mod - if (locInPeptide == 1 && CanBeFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) - { - fivePrimeVariableMods.Add(variableModification); - } - - int r = locInPeptide - 1; - if (r >= 0 && r < oligoLength - && (NucleicAcid.IsDecoy || - (ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) - && variableModification.LocationRestriction == "Anywhere."))) + if (variableModification.LocationRestriction == "Anywhere." && + ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) + && !ModificationLocalization.UniprotModExists(NucleicAcid, r + 1, variableModification)) { if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) { @@ -129,37 +88,88 @@ internal IEnumerable GenerateModifiedOligos(List residueVariableMods.Add(variableModification); } } + } + // Check if can be a 3'-term mod + if (CanBeThreePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) + { + threePrimeVariableMods.Add(variableModification); + } + } + + // collect all localized modifications from the database. 
+ foreach (var kvp in NucleicAcid.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; + if (!inBounds) + { + continue; + } - // Check if can be a 3'-term mod - if (locInPeptide == oligoLength && CanBeThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + int locInPeptide = kvp.Key - OneBasedStartResidue + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is Modification variableModification) { - threePrimeVariableMods.Add(variableModification); + // Check if can be a 5'-term mod + if (locInPeptide == 1 && CanBeFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + { + fivePrimeVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < oligoLength + && (NucleicAcid.IsDecoy || + (ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + + // Check if can be a 3'-term mod + if (locInPeptide == oligoLength && CanBeThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + { + threePrimeVariableMods.Add(variableModification); + } } } } - } - int variableModificationIsoforms = 0; + int variableModificationIsoforms = 0; + SetFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, ref fixedModDictionary); - // Add the mods to the oligo by return numerous OligoWithSetMods - foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, 
oligoLength)) - { - int numFixedMods = 0; - foreach (var fixedModPattern in GetFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods)) + // Add the mods to the oligo by return numerous OligoWithSetMods + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) { - if (!variableModPattern.ContainsKey(fixedModPattern.Key)) + int numFixedMods = 0; + foreach (var fixedModPattern in fixedModDictionary) + { + if (!variableModPattern.ContainsKey(fixedModPattern.Key)) + { + numFixedMods++; + variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); + } + } + yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, + CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); + variableModificationIsoforms++; + if (variableModificationIsoforms == maximumVariableModificationIsoforms) { - numFixedMods++; - variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); + yield break; } } - yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, - CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); - variableModificationIsoforms++; - if (variableModificationIsoforms == maximumVariableModificationIsoforms) - { - yield break; - } + } + finally + { + DictionaryPool.Return(twoBasedPossibleVariableAndLocalizeableModifications); + FixedModDictionaryPool.Return(fixedModDictionary); } } From 468c46d8062ada2f55196a7e80c65155a4029a62 Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 14 Jan 2025 13:03:34 -0600 Subject: [PATCH 11/21] Added comments to digeston --- mzLib/Omics/Digestion/DigestionProduct.cs | 66 +++++++++++++++++------ mzLib/mzLib.nuspec | 12 ++--- 2 files changed, 55 insertions(+), 23 deletions(-) diff --git 
a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index fde32cf96..a73820248 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -39,6 +39,19 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one public int Length => BaseSequence.Length; //how many residues long the peptide is public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; + /// + /// Generates all possible variable modification patterns for a peptide. + /// + /// A dictionary of possible variable modifications with their positions. + /// The maximum number of modifications allowed for the peptide. + /// The length of the peptide. + /// An enumerable of dictionaries representing different modification patterns. + /// + /// This method generates all possible combinations of variable modifications for a given peptide. + /// It first calculates the total number of available modifications and the maximum number of variable modifications allowed. + /// Then, it iterates through all possible numbers of modifications and generates the corresponding modification patterns. 
+ /// The returned dictionary is then appended with fixed modifications and used to construct a peptide with set mods + /// protected static IEnumerable> GetVariableModificationPatterns(Dictionary> possibleVariableModifications, int maxModsForPeptide, int peptideLength) { if (possibleVariableModifications.Count == 0) @@ -56,12 +69,33 @@ protected static IEnumerable> GetVariableModificat foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications.ToList(), possibleVariableModifications.Count - variable_modifications, baseVariableModificationPattern, 0)) { - yield return GetNewVariableModificationPattern(variable_modification_pattern, possibleVariableModifications); + // use modification pattern to construct a dictionary of modifications for the peptide + var modificationPattern = new Dictionary(possibleVariableModifications.Count); + + foreach (KeyValuePair> kvp in possibleVariableModifications) + { + if (variable_modification_pattern[kvp.Key] > 0) + { + modificationPattern.Add(kvp.Key, kvp.Value[variable_modification_pattern[kvp.Key] - 1]); + } + } + + yield return modificationPattern; } } } } + /// + /// Sets the fixed modifications for the peptide, considering the N-terminal and C-terminal positions, by populating the dictionary. + /// + /// The length of the peptide. + /// A collection of all known fixed modifications. + /// A reference to a dictionary that will hold the fixed modifications, with the key representing the position. + /// + /// This method iterates through all known fixed modifications and assigns them to the appropriate positions in the peptide. + /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. 
+ /// protected void SetFixedModsOneIsNorFivePrimeTerminus(int length, IEnumerable allKnownFixedModifications, ref Dictionary fixedModsOneIsNterminus) { @@ -119,6 +153,20 @@ protected void SetFixedModsOneIsNorFivePrimeTerminus(int length, } } + /// + /// Recursively generates all possible variable modification patterns for a peptide. + /// + /// A list of key-value pairs representing possible variable modifications and their positions. + /// The number of unmodified residues desired in the pattern. + /// An array representing the current modification pattern. + /// The current index in the list of possible modifications. + /// An enumerable of arrays representing different modification patterns. The array index corresponds to the location of the modification + /// in the peptide, while the value at that index determines which index in the list of modifications + /// to add to the final variable modification pattern + /// + /// This method uses recursion to generate all possible combinations of variable modifications for a given peptide. + /// It considers both modified and unmodified residues and generates patterns accordingly. 
+ /// private static IEnumerable GetVariableModificationPatterns(List>> possibleVariableModifications, int unmodifiedResiduesDesired, int[] variableModificationPattern, int index) { @@ -163,21 +211,5 @@ private static IEnumerable GetVariableModificationPatterns(List GetNewVariableModificationPattern(int[] variableModificationArray, - Dictionary> possibleVariableModifications) - { - var modification_pattern = new Dictionary(possibleVariableModifications.Count); - - foreach (KeyValuePair> kvp in possibleVariableModifications) - { - if (variableModificationArray[kvp.Key] > 0) - { - modification_pattern.Add(kvp.Key, kvp.Value[variableModificationArray[kvp.Key] - 1]); - } - } - - return modification_pattern; - } } } diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 8cee705ec..b04e6c2e5 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 1.0.547 + 5.2.44 mzLib Stef S. Stef S. @@ -16,8 +16,8 @@ - - + + @@ -29,8 +29,8 @@ - - + + @@ -87,4 +87,4 @@ - + \ No newline at end of file From 2c12e594391bb1519031a8909d650169c2db9ea5 Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 14 Jan 2025 14:17:49 -0600 Subject: [PATCH 12/21] Refactor code for readability and efficiency - Simplified initial check for `possibleVariableModifications.Count` and replaced `yield return null` with `yield break`. - Adjusted indentation and loop structure for clarity. - Refactored nested loop to remove unnecessary braces and streamline logic. - Simplified construction of `modificationPattern` dictionary by removing redundant checks and directly using `modIndex`. 
--- mzLib/Omics/Digestion/DigestionProduct.cs | 41 +++++++++++------------ 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index a73820248..db1c30bb3 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -54,34 +54,31 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one /// protected static IEnumerable> GetVariableModificationPatterns(Dictionary> possibleVariableModifications, int maxModsForPeptide, int peptideLength) { - if (possibleVariableModifications.Count == 0) - { - yield return null; - } - else - { - int[] baseVariableModificationPattern = new int[peptideLength + 4]; - int totalAvailableMods = possibleVariableModifications.Values.Sum(modList => modList?.Count ?? 0); - int maxVariableMods = Math.Min(totalAvailableMods, maxModsForPeptide); + if (possibleVariableModifications.Count <= 0) + yield break; + + int[] baseVariableModificationPattern = new int[peptideLength + 4]; + int totalAvailableMods = possibleVariableModifications.Values.Sum(modList => modList?.Count ?? 
0); + int maxVariableMods = Math.Min(totalAvailableMods, maxModsForPeptide); - for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) + for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) + { + foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications.ToList(), + possibleVariableModifications.Count - variable_modifications, baseVariableModificationPattern, 0)) { - foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications.ToList(), - possibleVariableModifications.Count - variable_modifications, baseVariableModificationPattern, 0)) - { - // use modification pattern to construct a dictionary of modifications for the peptide - var modificationPattern = new Dictionary(possibleVariableModifications.Count); + // use modification pattern to construct a dictionary of modifications for the peptide + var modificationPattern = new Dictionary(possibleVariableModifications.Count); - foreach (KeyValuePair> kvp in possibleVariableModifications) + foreach (KeyValuePair> kvp in possibleVariableModifications) + { + int modIndex = variable_modification_pattern[kvp.Key] - 1; + if (modIndex >= 0) { - if (variable_modification_pattern[kvp.Key] > 0) - { - modificationPattern.Add(kvp.Key, kvp.Value[variable_modification_pattern[kvp.Key] - 1]); - } + modificationPattern.Add(kvp.Key, kvp.Value[modIndex]); } - - yield return modificationPattern; } + + yield return modificationPattern; } } } From 81aefc5d1b42706de9f80db76ed8903b56021d2e Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 15 Jan 2025 15:19:45 -0600 Subject: [PATCH 13/21] Added many comments --- mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs | 40 ++++++++++++++++++- mzLib/MzLibUtil/ObjectPools/HashSetPool.cs | 40 ++++++++++++++++--- mzLib/MzLibUtil/ObjectPools/ListPool.cs | 38 +++++++++++++++++- 
mzLib/Omics/Digestion/DigestionAgent.cs | 5 ++- 4 files changed, 111 insertions(+), 12 deletions(-) diff --git a/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs b/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs index ca2f2f712..0e3efec5c 100644 --- a/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs +++ b/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs @@ -4,7 +4,28 @@ namespace MzLibUtil; -// Used to pool HashSet instances to reduce memory allocations +// Example Usage: +// var pool = new DictionaryPool(); +// var dictionary = pool.Get(); +// try { +// dictionary.Add(1,1); +// Do Work +// } +// finally { +// pool.Return(dictionary); +// } + +/// +/// Provides a pool for instances to reduce memory allocations. +/// This class uses the from Microsoft.Extensions.ObjectPool +/// to manage the pooling of objects. +/// +/// The type of keys in the . +/// The type of values in the . +/// +/// This class is not thread-safe and should not be shared between threads. +/// This class should be pulled from outside a try finally loop and finally should return the Dictionary to the pool to ensure proper pooling in the case of a caught exception. +/// public class DictionaryPool where TKey : notnull { private readonly ObjectPool> _pool; @@ -37,17 +58,32 @@ public void Return(Dictionary dictionary) _pool.Return(dictionary); } + /// + /// Policy for pooling Dictionary instances with a specified initial capacity. + /// + /// The type of keys in the Dictionary. + /// The type of values in the Dictionary. + /// The initial capacity for the pooled Dictionary instances. private class DictionaryPooledObjectPolicy(int initialCapacity) : PooledObjectPolicy> where TKeyItem : notnull { private int InitialCapacity { get; } = initialCapacity; + /// + /// Creates a new Dictionary instance with the specified initial capacity. + /// + /// A new Dictionary instance. 
public override Dictionary Create() { return new Dictionary(capacity: InitialCapacity); } + /// + /// Returns a Dictionary instance to the pool after clearing it. + /// + /// The Dictionary instance to return. + /// True if the Dictionary instance can be reused; otherwise, false. public override bool Return(Dictionary obj) { // Ensure the Dictionary can be safely reused @@ -55,4 +91,4 @@ public override bool Return(Dictionary obj) return true; } } -} \ No newline at end of file +} diff --git a/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs b/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs index ef3e152d1..ae33ffd99 100644 --- a/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs +++ b/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs @@ -4,14 +4,29 @@ namespace MzLibUtil; + // Example Usage: // var pool = new HashSetPool(); // var hashSet = pool.Get(); +// try { // hashSet.Add(1); // Do Work +// } +// finally { // pool.Return(hashSet); +// } -// Used to pool HashSet instances to reduce memory allocations +/// +/// Provides a pool for instances to reduce memory allocations. +/// This class uses the from Microsoft.Extensions.ObjectPool +/// to manage the pooling of objects. +/// +/// The type of elements in the . +/// +/// This class is not thread-safe and should not be shared between threads. +/// This class should be pulled from outside a try finally loop and finally should return the HashSet to the pool to ensure proper pooling in the case of a caught exception +/// See example found in DigestionAgent.GetDigestionSiteIndices() for proper usage +/// public class HashSetPool { private readonly ObjectPool> _pool; @@ -27,15 +42,15 @@ public HashSetPool(int initialCapacity = 16) } /// - /// Retrieves a HashSet instance from the pool. + /// Retrieves a instance from the pool. /// - /// A HashSet instance. + /// A instance. public HashSet Get() => _pool.Get(); /// - /// Returns a HashSet instance back to the pool. + /// Returns a instance back to the pool. 
/// - /// The HashSet instance to return. + /// The instance to return. public void Return(HashSet hashSet) { if (hashSet == null) throw new ArgumentNullException(nameof(hashSet)); @@ -43,13 +58,26 @@ public void Return(HashSet hashSet) _pool.Return(hashSet); } + /// + /// Defines the policy for creating and returning instances to the pool. + /// + /// The type of elements in the . private class HashSetPooledObjectPolicy(int initialCapacity) : PooledObjectPolicy> { + /// + /// Creates a new instance with the specified initial capacity. + /// + /// A new instance. public override HashSet Create() { return new HashSet(capacity: initialCapacity); } + /// + /// Returns a instance to the pool after clearing it. + /// + /// The instance to return. + /// Always returns true. public override bool Return(HashSet obj) { // Ensure the HashSet can be safely reused @@ -57,4 +85,4 @@ public override bool Return(HashSet obj) return true; } } -} \ No newline at end of file +} diff --git a/mzLib/MzLibUtil/ObjectPools/ListPool.cs b/mzLib/MzLibUtil/ObjectPools/ListPool.cs index 310369b90..9f25a7926 100644 --- a/mzLib/MzLibUtil/ObjectPools/ListPool.cs +++ b/mzLib/MzLibUtil/ObjectPools/ListPool.cs @@ -4,7 +4,27 @@ namespace MzLibUtil; -// Used to pool HashSet instances to reduce memory allocations +// Example Usage: +// var pool = new ListPool(); +// var list = pool.Get(); +// try { +// list.Add(1); +// Do Work +// } +// finally { +// pool.Return(list); +// } + +/// +/// Provides a pool for instances to reduce memory allocations. +/// This class uses the from Microsoft.Extensions.ObjectPool +/// to manage the pooling of objects. +/// +/// The type of elements in the . +/// +/// This class is not thread-safe and should not be shared between threads. +/// This class should be pulled from outside a try finally loop and finally should return the List to the pool to ensure proper pooling in the case of a caught exception. 
+/// public class ListPool { private readonly ObjectPool> _pool; @@ -37,18 +57,32 @@ public void Return(List list) _pool.Return(list); } + /// + /// Policy for pooling List instances with a specified initial capacity. + /// + /// The type of elements in the List. + /// The initial capacity for the pooled List instances. private class ListPooledObjectPolicy(int initialCapacity) : PooledObjectPolicy> { private int InitialCapacity { get; } = initialCapacity; + /// + /// Creates a new List instance with the specified initial capacity. + /// + /// A new List instance. public override List Create() { return new List(capacity: InitialCapacity); } + /// + /// Resets the List instance to a clean state before returning it to the pool. + /// + /// The List instance to reset and return. + /// True if the List instance can be returned to the pool; otherwise, false. public override bool Return(List obj) { - // Ensure the HashSet can be safely reused + // Ensure the List can be safely reused obj.Clear(); return true; } diff --git a/mzLib/Omics/Digestion/DigestionAgent.cs b/mzLib/Omics/Digestion/DigestionAgent.cs index d2f3b37db..734d5e65f 100644 --- a/mzLib/Omics/Digestion/DigestionAgent.cs +++ b/mzLib/Omics/Digestion/DigestionAgent.cs @@ -77,7 +77,7 @@ protected static bool ValidMaxLength(int? length, int maxLength) public List GetDigestionSiteIndices(string sequence) { var indices = HashSetPool.Get(); // use hash set to ensure no duplicates - try + try // Try block is to ensure that, even if an error gets thrown, the hashset is returned to the pool { indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide @@ -111,10 +111,11 @@ public List GetDigestionSiteIndices(string sequence) } indices.Add(sequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide - return indices.ToList(); + return indices.ToList(); // convert the hashset to a list for return. 
} finally { + // return hashset to pool. This clears it and gets it ready for the next time it is needed from the pool. HashSetPool.Return(indices); } } From 43d688218020c25c5257c1943e91adb782cccf91 Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 15 Jan 2025 18:00:54 -0600 Subject: [PATCH 14/21] set fixed mods namechange --- mzLib/Omics/Digestion/DigestionProduct.cs | 2 +- mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs | 2 +- mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index db1c30bb3..d5de0c26b 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -93,7 +93,7 @@ protected static IEnumerable> GetVariableModificat /// This method iterates through all known fixed modifications and assigns them to the appropriate positions in the peptide. /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. 
/// - protected void SetFixedModsOneIsNorFivePrimeTerminus(int length, + protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, IEnumerable allKnownFixedModifications, ref Dictionary fixedModsOneIsNterminus) { foreach (Modification mod in allKnownFixedModifications) diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index 5fb07fa0b..84275ae99 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -143,7 +143,7 @@ internal IEnumerable GetModifiedPeptides(List kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) { diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index 90c6f1c00..180cb35af 100644 --- a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -143,7 +143,7 @@ internal IEnumerable GenerateModifiedOligos(List } int variableModificationIsoforms = 0; - SetFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, ref fixedModDictionary); + PopulateFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, ref fixedModDictionary); // Add the mods to the oligo by return numerous OligoWithSetMods foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) From ddbcf01acddb4f4a16c5fe2e72bca73d2122575e Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 15 Jan 2025 18:03:17 -0600 Subject: [PATCH 15/21] Eliminated IsN or IS5' in favor of unified method --- mzLib/Omics/Digestion/DigestionProduct.cs | 24 +++++++++++++ .../ProteolyticPeptide.cs | 34 +++---------------- .../Digestion/NucleolyticOligo.cs | 22 +++--------- 3 files changed, 34 insertions(+), 46 deletions(-) diff 
--git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index d5de0c26b..e7eb697bd 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -208,5 +208,29 @@ private static IEnumerable GetVariableModificationPatterns(List + /// Determines if a modification can be applied to the N-terminal or 5' end of the peptide. + /// + /// The modification to check. + /// The length of the peptide. + /// True if the modification can be applied to the N-terminal or 5' end; otherwise, false. + protected bool CanBeNTerminalOrFivePrime(Modification mod, int peptideLength) + { + return mod.LocationRestriction is "5'-terminal." or "Oligo 5'-terminal." or "N-terminal." or "Peptide N-terminal." + && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, peptideLength, OneBasedStartResidue); + } + + /// + /// Determines if a modification can be applied to the C-terminal or 3' end of the peptide. + /// + /// The modification to check. + /// The length of the peptide. + /// True if the modification can be applied to the C-terminal or 3' end; otherwise, false. + protected bool CanBeCTerminalOrThreePrime(Modification mod, int peptideLength) + { + return mod.LocationRestriction is "3'-terminal." or "Oligo 3'-terminal." or "C-terminal." or "Peptide C-terminal." 
+ && ModificationLocalization.ModFits(mod, Parent.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); + } } } diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index 84275ae99..8bb64157b 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -51,6 +51,7 @@ public string PeptideDescription internal IEnumerable GetModifiedPeptides(List allKnownFixedModifications, DigestionParams digestionParams, List variableModifications) { + int variable_modification_isoforms = 0; int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; int maxModsForPeptide = digestionParams.MaxModsForPeptide; @@ -68,7 +69,7 @@ internal IEnumerable GetModifiedPeptides(List GetModifiedPeptides(List GetModifiedPeptides(List GetModifiedPeptides(List GetModifiedPeptides(List kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) @@ -171,29 +171,5 @@ internal IEnumerable GetModifiedPeptides(List - /// Determines whether given modification can be an N-terminal modification - /// - /// - /// - /// - private bool CanBeNTerminalMod(Modification variableModification, int peptideLength) - { - return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, 1, peptideLength, OneBasedStartResidue) - && (variableModification.LocationRestriction == "N-terminal." 
|| variableModification.LocationRestriction == "Peptide N-terminal."); - } - - /// - /// Determines whether given modification can be a C-terminal modification - /// - /// - /// - /// - private bool CanBeCTerminalMod(Modification variableModification, int peptideLength) - { - return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1) - && (variableModification.LocationRestriction == "C-terminal." || variableModification.LocationRestriction == "Peptide C-terminal."); - } } } \ No newline at end of file diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index 180cb35af..9e28b57c6 100644 --- a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -49,6 +49,7 @@ public override string ToString() internal IEnumerable GenerateModifiedOligos(List allKnownFixedMods, RnaDigestionParams digestionParams, List variableModifications) { + int variableModificationIsoforms = 0; int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; int maxModsForOligo = digestionParams.MaxMods; @@ -67,7 +68,7 @@ internal IEnumerable GenerateModifiedOligos(List foreach (Modification variableModification in variableModifications) { // Check if can be a 5'-term mod - if (CanBeFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) + if (CanBeNTerminalOrFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) { fivePrimeVariableMods.Add(variableModification); } @@ -90,7 +91,7 @@ internal IEnumerable GenerateModifiedOligos(List } } // Check if can be a 3'-term mod - if (CanBeThreePrime(variableModification, oligoLength) && 
!ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) + if (CanBeCTerminalOrThreePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) { threePrimeVariableMods.Add(variableModification); } @@ -111,7 +112,7 @@ internal IEnumerable GenerateModifiedOligos(List if (modWithMass is Modification variableModification) { // Check if can be a 5'-term mod - if (locInPeptide == 1 && CanBeFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + if (locInPeptide == 1 && CanBeNTerminalOrFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) { fivePrimeVariableMods.Add(variableModification); } @@ -134,7 +135,7 @@ internal IEnumerable GenerateModifiedOligos(List } // Check if can be a 3'-term mod - if (locInPeptide == oligoLength && CanBeThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + if (locInPeptide == oligoLength && CanBeCTerminalOrThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) { threePrimeVariableMods.Add(variableModification); } @@ -142,7 +143,6 @@ internal IEnumerable GenerateModifiedOligos(List } } - int variableModificationIsoforms = 0; PopulateFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, ref fixedModDictionary); // Add the mods to the oligo by return numerous OligoWithSetMods @@ -172,17 +172,5 @@ internal IEnumerable GenerateModifiedOligos(List FixedModDictionaryPool.Return(fixedModDictionary); } } - - private bool CanBeFivePrime(Modification variableModification, int peptideLength) - { - return (variableModification.LocationRestriction == "5'-terminal." 
|| variableModification.LocationRestriction == "Oligo 5'-terminal.") - && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, 1, peptideLength, OneBasedStartResidue); - } - - private bool CanBeThreePrime(Modification variableModification, int peptideLength) - { - return (variableModification.LocationRestriction == "3'-terminal." || variableModification.LocationRestriction == "Oligo 3'-terminal.") - && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); - } } } From 0b59b0b358fd34bc29a857982a59a0aebac368ad Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 15 Jan 2025 18:09:17 -0600 Subject: [PATCH 16/21] Extracted all variable modification combination generation to parent class --- mzLib/Omics/Digestion/DigestionProduct.cs | 115 +++++++++++++++++- .../ProteolyticPeptide.cs | 84 +------------ .../Digestion/NucleolyticOligo.cs | 86 +------------ 3 files changed, 111 insertions(+), 174 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index e7eb697bd..194b1d3e3 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -1,5 +1,6 @@ using MzLibUtil; using Omics.Modifications; +using System.Runtime.CompilerServices; namespace Omics.Digestion { @@ -39,6 +40,8 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one public int Length => BaseSequence.Length; //how many residues long the peptide is public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; + #region Digestion Helper Methods + /// /// Generates all possible variable modification patterns for a peptide. 
/// @@ -63,7 +66,7 @@ protected static IEnumerable> GetVariableModificat for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) { - foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications.ToList(), + foreach (int[] variable_modification_pattern in GetVariableModificationPatternsRecursive(possibleVariableModifications.ToList(), possibleVariableModifications.Count - variable_modifications, baseVariableModificationPattern, 0)) { // use modification pattern to construct a dictionary of modifications for the peptide @@ -150,6 +153,104 @@ protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, } } + /// + /// Populates the variable modifications dictionary from both the variable modifications and the localized mods from xml reading, + /// considering the N-terminal, C-terminal, and internal positions. + /// + /// A list of all variable modifications. + /// A reference to a dictionary that will hold the variable modifications, with the key representing the position. + /// + /// This method iterates through all variable modifications and assigns them to the appropriate positions in the peptide. + /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. 
+ /// + protected void PopulateVariableModifications(List allVariableMods, ref Dictionary> twoBasedDictToPopulate) + { + int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; + var pepNTermVariableMods = new List(); + twoBasedDictToPopulate.Add(1, pepNTermVariableMods); + + var pepCTermVariableMods = new List(); + twoBasedDictToPopulate.Add(peptideLength + 2, pepCTermVariableMods); + + // VARIABLE MODS + foreach (Modification variableModification in allVariableMods) + { + // Check if can be a n-term mod + if (CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Parent, 1, variableModification)) + { + pepNTermVariableMods.Add(variableModification); + } + + for (int r = 0; r < peptideLength; r++) + { + if (ModificationLocalization.ModFits(variableModification, Parent.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere." && !ModificationLocalization.UniprotModExists(Parent, r + 1, variableModification)) + { + if (!twoBasedDictToPopulate.TryGetValue(r + 2, out var residueVariableMods)) + { + residueVariableMods = new List() { variableModification }; + twoBasedDictToPopulate.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + } + // Check if can be a c-term mod + if (CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Parent, peptideLength, variableModification)) + { + pepCTermVariableMods.Add(variableModification); + } + } + + // LOCALIZED MODS + foreach (var kvp in Parent.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; + if (!inBounds) + { + continue; + } + + int locInPeptide = kvp.Key - OneBasedStartResidue + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is not Modification variableModification) + continue; + + // 
Check if can be a n-term mod + if (locInPeptide == 1 && CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !Parent.IsDecoy) + { + pepNTermVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < peptideLength + && (Parent.IsDecoy || + (ModificationLocalization.ModFits(variableModification, Parent.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedDictToPopulate.TryGetValue(r + 2, out var residueVariableMods)) + { + residueVariableMods = new List() { variableModification }; + twoBasedDictToPopulate.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + + // Check if can be a c-term mod + if (locInPeptide == peptideLength && CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !Parent.IsDecoy) + { + pepCTermVariableMods.Add(variableModification); + } + } + } + } + /// /// Recursively generates all possible variable modification patterns for a peptide. /// @@ -164,7 +265,7 @@ protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, /// This method uses recursion to generate all possible combinations of variable modifications for a given peptide. /// It considers both modified and unmodified residues and generates patterns accordingly. 
/// - private static IEnumerable GetVariableModificationPatterns(List>> possibleVariableModifications, + private static IEnumerable GetVariableModificationPatternsRecursive(List>> possibleVariableModifications, int unmodifiedResiduesDesired, int[] variableModificationPattern, int index) { if (index < possibleVariableModifications.Count - 1) @@ -172,7 +273,7 @@ private static IEnumerable GetVariableModificationPatterns(List 0) { variableModificationPattern[possibleVariableModifications[index].Key] = 0; - foreach (int[] new_variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications, + foreach (int[] new_variable_modification_pattern in GetVariableModificationPatternsRecursive(possibleVariableModifications, unmodifiedResiduesDesired - 1, variableModificationPattern, index + 1)) { yield return new_variable_modification_pattern; @@ -183,7 +284,7 @@ private static IEnumerable GetVariableModificationPatterns(List GetVariableModificationPatterns(ListThe modification to check. /// The length of the peptide. /// True if the modification can be applied to the N-terminal or 5' end; otherwise, false. - protected bool CanBeNTerminalOrFivePrime(Modification mod, int peptideLength) + private bool CanBeNTerminalOrFivePrime(Modification mod, int peptideLength) { return mod.LocationRestriction is "5'-terminal." or "Oligo 5'-terminal." or "N-terminal." or "Peptide N-terminal." && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, peptideLength, OneBasedStartResidue); @@ -227,10 +328,12 @@ protected bool CanBeNTerminalOrFivePrime(Modification mod, int peptideLength) /// The modification to check. /// The length of the peptide. /// True if the modification can be applied to the C-terminal or 3' end; otherwise, false. - protected bool CanBeCTerminalOrThreePrime(Modification mod, int peptideLength) + private bool CanBeCTerminalOrThreePrime(Modification mod, int peptideLength) { return mod.LocationRestriction is "3'-terminal." 
or "Oligo 3'-terminal." or "C-terminal." or "Peptide C-terminal." && ModificationLocalization.ModFits(mod, Parent.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); } + + #endregion } } diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index 8bb64157b..243f92f5e 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -60,89 +60,7 @@ internal IEnumerable GetModifiedPeptides(List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(1, pepNTermVariableMods); - - var pepCTermVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(peptideLength + 2, pepCTermVariableMods); - - foreach (Modification variableModification in variableModifications) - { - // Check if can be a n-term mod - if (CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, 1, variableModification)) - { - pepNTermVariableMods.Add(variableModification); - } - - for (int r = 0; r < peptideLength; r++) - { - if (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) - && variableModification.LocationRestriction == "Anywhere." 
&& !ModificationLocalization.UniprotModExists(Protein, r + 1, variableModification)) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - } - // Check if can be a c-term mod - if (CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, peptideLength, variableModification)) - { - pepCTermVariableMods.Add(variableModification); - } - } - - // LOCALIZED MODS - foreach (var kvp in Protein.OneBasedPossibleLocalizedModifications) - { - bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; - if (!inBounds) - { - continue; - } - - int locInPeptide = kvp.Key - OneBasedStartResidueInProtein + 1; - foreach (Modification modWithMass in kvp.Value) - { - if (modWithMass is Modification variableModification) - { - // Check if can be a n-term mod - if (locInPeptide == 1 && CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !Protein.IsDecoy) - { - pepNTermVariableMods.Add(variableModification); - } - - int r = locInPeptide - 1; - if (r >= 0 && r < peptideLength - && (Protein.IsDecoy || - (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidueInProtein + r) - && variableModification.LocationRestriction == "Anywhere."))) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - - // Check if can be a c-term mod - if (locInPeptide == peptideLength && 
CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !Protein.IsDecoy) - { - pepCTermVariableMods.Add(variableModification); - } - } - } - } - + PopulateVariableModifications(variableModifications, ref twoBasedPossibleVariableAndLocalizeableModifications); PopulateFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications, ref fixedModDictionary); foreach (Dictionary kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index 9e28b57c6..4dfe35871 100644 --- a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -58,91 +58,7 @@ internal IEnumerable GenerateModifiedOligos(List try { - var fivePrimeVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(1, fivePrimeVariableMods); - - var threePrimeVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(oligoLength + 2, threePrimeVariableMods); - - // collect all possible variable mods, skipping if there is a database annotated modification - foreach (Modification variableModification in variableModifications) - { - // Check if can be a 5'-term mod - if (CanBeNTerminalOrFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) - { - fivePrimeVariableMods.Add(variableModification); - } - - for (int r = 0; r < oligoLength; r++) - { - if (variableModification.LocationRestriction == "Anywhere." 
&& - ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) - && !ModificationLocalization.UniprotModExists(NucleicAcid, r + 1, variableModification)) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - } - // Check if can be a 3'-term mod - if (CanBeCTerminalOrThreePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) - { - threePrimeVariableMods.Add(variableModification); - } - } - - // collect all localized modifications from the database. - foreach (var kvp in NucleicAcid.OneBasedPossibleLocalizedModifications) - { - bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; - if (!inBounds) - { - continue; - } - - int locInPeptide = kvp.Key - OneBasedStartResidue + 1; - foreach (Modification modWithMass in kvp.Value) - { - if (modWithMass is Modification variableModification) - { - // Check if can be a 5'-term mod - if (locInPeptide == 1 && CanBeNTerminalOrFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) - { - fivePrimeVariableMods.Add(variableModification); - } - - int r = locInPeptide - 1; - if (r >= 0 && r < oligoLength - && (NucleicAcid.IsDecoy || - (ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) - && variableModification.LocationRestriction == "Anywhere."))) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, 
residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - - // Check if can be a 3'-term mod - if (locInPeptide == oligoLength && CanBeCTerminalOrThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) - { - threePrimeVariableMods.Add(variableModification); - } - } - } - } - + PopulateVariableModifications(variableModifications, ref twoBasedPossibleVariableAndLocalizeableModifications); PopulateFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, ref fixedModDictionary); // Add the mods to the oligo by return numerous OligoWithSetMods From 9096c217838459e713c40bc6b0e2a34820366f9f Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 15 Jan 2025 18:13:38 -0600 Subject: [PATCH 17/21] removed fixed mods changes --- .../Modifications/ModificationLocalization.cs | 4 +-- mzLib/Test/TestProteinDigestion.cs | 35 ++----------------- 2 files changed, 4 insertions(+), 35 deletions(-) diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs b/mzLib/Omics/Modifications/ModificationLocalization.cs index 8928d8c9a..e2c57fa2d 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -32,9 +32,9 @@ public static bool ModFits(Modification attemptToLocalize, string sequence, int switch (attemptToLocalize.LocationRestriction) { case "N-terminal." when bioPolymerOneBasedIndex > 2: - case "Peptide N-terminal." when digestionProductOneBasedIndex > 1 || bioPolymerOneBasedIndex == 1: + case "Peptide N-terminal." when digestionProductOneBasedIndex > 1: case "C-terminal." when bioPolymerOneBasedIndex < sequence.Length: - case "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength || bioPolymerOneBasedIndex == sequence.Length: + case "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength: case "5'-terminal." 
when bioPolymerOneBasedIndex > 2: // first residue in oligo but not first in nucleic acid case "Oligo 5'-terminal." when digestionProductOneBasedIndex > 1 diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index 821288dae..02cc3aed5 100644 --- a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -220,7 +220,7 @@ public static void TestPeptideWithSetModifications() variableModifications.Add(new Modification(_originalId: "ProtCmod", _target: motif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); var ye = prot.Digest(digestionParams, new List(), variableModifications).ToList(); - Assert.AreEqual(2 * 2 * 2, ye.Count); + Assert.AreEqual(3 * 2 * 3, ye.Count); Assert.AreEqual("[H]M[H][H]", ye.Last().SequenceWithChemicalFormulas); double m1 = 5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass; @@ -251,43 +251,12 @@ public static void TestPeptideWithFixedModifications() Assert.AreEqual(1, ok.Count); - Assert.AreEqual("[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); + Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); Assert.AreEqual("[H]M[H][H]", ok.First().SequenceWithChemicalFormulas); Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); } - [Test] - public static void TestPeptideWithFixedModifications_TwoProducts() - { - var prot = new Protein("MKM", null); - DigestionParams digestionParams = new DigestionParams(maxMissedCleavages: 0, minPeptideLength: 1, maxModsForPeptides: 3, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); - List fixedMods = new List(); - ModificationMotif.TryGetMotif("M", out 
ModificationMotif mMotif); - ModificationMotif.TryGetMotif("K", out ModificationMotif kMotif); - - fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: mMotif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: kMotif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "pepNmod", _target: mMotif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "pepNmod", _target: kMotif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "resMod", _target: mMotif, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "PepCmod", _target: mMotif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "PepCmod", _target: kMotif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "ProtCmod", _target: mMotif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: 
"ProtCmod", _target: kMotif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - - var ok = prot.Digest(digestionParams, fixedMods, new List()).ToList(); - - Assert.AreEqual(2, ok.Count); - - Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K[:PepCmod on K]", ok.First().FullSequence); - Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.Skip(1).First().FullSequence); - - Assert.AreEqual("[H]M[H]K[H]", ok.First().SequenceWithChemicalFormulas); - Assert.AreEqual("[H]M[H][H]", ok.Skip(1).First().SequenceWithChemicalFormulas); - Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); - } - [Test] public static void TestDigestIndices() { From 010f93fc6ffeff708660f1852717e1e07ecb6ce7 Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 15 Jan 2025 18:20:58 -0600 Subject: [PATCH 18/21] removed unnecessary namespace --- mzLib/Omics/Digestion/DigestionProduct.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index 194b1d3e3..4cad95a14 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -1,6 +1,5 @@ using MzLibUtil; using Omics.Modifications; -using System.Runtime.CompilerServices; namespace Omics.Digestion { From 4e93b361198f7d1be07000670195eba296f3749d Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 15 Jan 2025 18:35:57 -0600 Subject: [PATCH 19/21] Extracted AppendFixedToVariabel --- mzLib/Omics/Digestion/DigestionProduct.cs | 27 +++++++++++++++++-- .../ProteolyticPeptide.cs | 20 +++++--------- .../Digestion/NucleolyticOligo.cs | 16 ++++------- 3 files changed, 37 insertions(+), 26 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs 
b/mzLib/Omics/Digestion/DigestionProduct.cs index 4cad95a14..e653dd62f 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -96,7 +96,7 @@ protected static IEnumerable> GetVariableModificat /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. /// protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, - IEnumerable allKnownFixedModifications, ref Dictionary fixedModsOneIsNterminus) + IEnumerable allKnownFixedModifications, in Dictionary fixedModsOneIsNterminus) { foreach (Modification mod in allKnownFixedModifications) { @@ -162,7 +162,7 @@ protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, /// This method iterates through all variable modifications and assigns them to the appropriate positions in the peptide. /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. /// - protected void PopulateVariableModifications(List allVariableMods, ref Dictionary> twoBasedDictToPopulate) + protected void PopulateVariableModifications(List allVariableMods, in Dictionary> twoBasedDictToPopulate) { int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; var pepNTermVariableMods = new List(); @@ -250,6 +250,29 @@ protected void PopulateVariableModifications(List allVariableMods, } } + /// + /// Appends fixed modifications to the variable modification pattern when no variable mod exists. + /// + /// The dictionary containing fixed modifications. + /// The dictionary containing the variable modification pattern. + /// The number of fixed modifications appended. + /// + /// This method iterates through the fixed modifications and adds them to the variable modification pattern + /// if they are not already present. The number of fixed modifications appended is returned via the out parameter. 
+ /// + protected void AppendFixedModificationsToVariable(in Dictionary fixedModDict, in Dictionary variableModPattern, out int numFixedMods) + { + numFixedMods = 0; + foreach (var fixedModPattern in fixedModDict) + { + if (variableModPattern.ContainsKey(fixedModPattern.Key)) + continue; + + numFixedMods++; + variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); + } + } + /// /// Recursively generates all possible variable modification patterns for a peptide. /// diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index 243f92f5e..615a3618d 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -60,22 +60,16 @@ internal IEnumerable GetModifiedPeptides(List kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) { - int numFixedMods = 0; - foreach (var ok in fixedModDictionary) - { - if (!kvp.ContainsKey(ok.Key)) - { - numFixedMods++; - kvp.Add(ok.Key, ok.Value); - } - } + AppendFixedModificationsToVariable(in fixedModDictionary, in variableModPattern, out int numFixedMods); + yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidue, OneBasedEndResidue, - CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, kvp, numFixedMods); + CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, variableModPattern, numFixedMods); + variable_modification_isoforms++; if (variable_modification_isoforms == maximumVariableModificationIsoforms) { diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs index 4dfe35871..767fb1564 100644 --- 
a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -58,23 +58,17 @@ internal IEnumerable GenerateModifiedOligos(List try { - PopulateVariableModifications(variableModifications, ref twoBasedPossibleVariableAndLocalizeableModifications); - PopulateFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, ref fixedModDictionary); + PopulateVariableModifications(variableModifications, in twoBasedPossibleVariableAndLocalizeableModifications); + PopulateFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, in fixedModDictionary); // Add the mods to the oligo by return numerous OligoWithSetMods foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) { - int numFixedMods = 0; - foreach (var fixedModPattern in fixedModDictionary) - { - if (!variableModPattern.ContainsKey(fixedModPattern.Key)) - { - numFixedMods++; - variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); - } - } + AppendFixedModificationsToVariable(in fixedModDictionary, in variableModPattern, out int numFixedMods); + yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); + variableModificationIsoforms++; if (variableModificationIsoforms == maximumVariableModificationIsoforms) { From 2171a73235199f4463c198884a35155d51ad767e Mon Sep 17 00:00:00 2001 From: Nic Bollis Date: Wed, 15 Jan 2025 18:45:10 -0600 Subject: [PATCH 20/21] Update mzLib.nuspec --- mzLib/mzLib.nuspec | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index b809b83cf..b0b4c3045 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 5.2.44 + 1.0.547 mzLib Stef S. Stef S. 
@@ -95,4 +95,4 @@ - \ No newline at end of file + From bff563e964f582e0316d415ca24dc278af6408bb Mon Sep 17 00:00:00 2001 From: nbollis Date: Thu, 16 Jan 2025 12:38:17 -0600 Subject: [PATCH 21/21] Added shortreed comment --- mzLib/Omics/Digestion/DigestionProduct.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index e653dd62f..13fe51610 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -42,7 +42,7 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one #region Digestion Helper Methods /// - /// Generates all possible variable modification patterns for a peptide. + /// Generates all possible variable modification patterns for a peptide, which includes variable and localized modifications but excludes fixed mods /// /// A dictionary of possible variable modifications with their positions. /// The maximum number of modifications allowed for the peptide.