From c73669112b9fb41a51f15fd404c479b50f58bf98 Mon Sep 17 00:00:00 2001 From: nbollis Date: Wed, 8 Jan 2025 22:02:10 -0600 Subject: [PATCH] made it go all the way through --- .../SearchTask/PostSearchAnalysisTask.cs | 307 ++++++++++-------- 1 file changed, 166 insertions(+), 141 deletions(-) diff --git a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs index 9f1b2e95e9..fcaf47c915 100644 --- a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs +++ b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs @@ -23,6 +23,7 @@ using Omics.Modifications; using Omics.SpectrumMatch; using Omics; +using ThermoFisher.CommonCore.Data; namespace TaskLayer { @@ -1014,56 +1015,31 @@ private void WriteFlashLFQResults() #region Pruned Database Writing + // This method works by replacing the modifications on the Proteins, writing the database, and restoring those modifications private void WritePrunedDatabase() { Status("Writing Pruned Database...", new List { Parameters.SearchTaskId }); + // find all biopolymers that have at least one confident PSM -> Used in Protein+Mod Pruned Database only var proteinToConfidentBaseSequences = GetProteinToConfidentBaseSequences(Parameters.AllPsms); - var proteinToConfidentModifiedSequences = GetProteinToConfidentModifiedSequences(Parameters.AllPsms, Parameters.SearchParameters.EvidenceRequiredToWriteLocalizedMod); - var proteinsOriginalModifications = new Dictionary>>(); - var originalSequenceVariantModifications = new Dictionary>>(); + // find all biopolymers that have at least one confident PSM and their confident localized modifications -> Used for determining which mods to retain + var proteinToConfidentModifiedSequences = GetProteinToConfidentModifiedSequences(Parameters.AllPsms, Parameters.SearchParameters.EvidenceRequiredToWriteLocalizedMod, Parameters.SearchParameters.IncludeProteinAmbiguous); - // populate the protein object with the desired modifications - UpdateProteinModifications(proteinToConfidentModifiedSequences, proteinsOriginalModifications, originalSequenceVariantModifications); + // populate the protein object with the desired modifications with a modify in place operation, original modifications are stored for later restoration + UpdateProteinModifications(proteinToConfidentModifiedSequences, out var proteinsOriginalModifications, out var originalSequenceVariantModifications); - var writers = WriteDatabases(proteinToConfidentBaseSequences); - var cleanupTask = new Task(() => - { - // Clean up - foreach (var nonVariantProtein in Parameters.ProteinList.Select(p => p.NonVariantProtein).Distinct()) - { - if (!nonVariantProtein.IsDecoy) - { - nonVariantProtein.OneBasedPossibleLocalizedModifications.Clear(); - foreach (var originalMod in proteinsOriginalModifications[nonVariantProtein.NonVariantProtein]) - { - nonVariantProtein.OneBasedPossibleLocalizedModifications.Add(originalMod.Key, originalMod.Value); - } - foreach (var sv in nonVariantProtein.SequenceVariations) - { - sv.OneBasedModifications.Clear(); - foreach (var originalVariantMods in originalSequenceVariantModifications[sv]) - { - sv.OneBasedModifications.Add(originalVariantMods.Key, originalVariantMods.Value); - } - } - } - } - }); - - // wait for all writing to stop, then restore proteins to their original state - var finalTaskForWriting = Task.WhenAll(writers).ContinueWith(t => cleanupTask.Start()); + WriteDatabases(proteinToConfidentBaseSequences); - // TODO: Return this and wait before exiting post search analysis if the cleanup is independent of subsequent processing - finalTaskForWriting.Wait(); + // Restore Original Modifications with a modify in place operation + RestoreOriginalModifications(in proteinsOriginalModifications, in originalSequenceVariantModifications); } /// /// Associate all confident PSMs with all possible proteins they could be digest products of (before or after parsimony) /// /// List of all PSMs. - /// A dictionary where the key is a protein and the value is all confidently identified species from that protein with an unambiguous base sequences. + /// A dictionary where the peptideModIndex is a protein and the value is all confidently identified species from that protein with an unambiguous base sequences. private Dictionary> GetProteinToConfidentBaseSequences(List allPsms) { var filteredPsms = FilteredPsms.Filter(allPsms, @@ -1074,7 +1050,6 @@ private Dictionary> GetProteinToConfid includeHighQValuePsms: false); var proteinToConfidentBaseSequences = new Dictionary>(); - foreach (SpectralMatch psm in filteredPsms) { foreach (var (_, bioPolymer) in psm.BestMatchingBioPolymersWithSetMods) @@ -1096,8 +1071,8 @@ private Dictionary> GetProteinToConfid /// /// List of all PSMs. /// - /// A dictionary where the key is a protein and the value is all confidently identified species from that protein with an unambiguous modified sequences. - public Dictionary> GetProteinToConfidentModifiedSequences(List allPsms, uint evidenceRequired = 1) + /// A dictionary where the peptideModIndex is a protein and the value is all confidently identified species from that protein with an unambiguous modified sequences. + public Dictionary> GetProteinToConfidentModifiedSequences(List allPsms, uint evidenceRequired = 1, bool includeProteinAmbiguous = false) { // set up and filter spectral matches var fileSpecificParametersDictionary = FileSpecificParameters.ToDictionary(p => p.FileName, p => p.Parameters); @@ -1110,7 +1085,7 @@ public Dictionary> GetProteinToConfide includeHighQValuePsms: false); // aggregate spectral matches by biopolymer and record the dissociation type and digestion agent - Dictionary> initialAggregation = new(); + Dictionary> initialAggregation = new(); foreach (SpectralMatch psm in originalModPsms) { fileSpecificParametersDictionary.TryGetValue(psm.FullFilePath, out var fileSpecificParameters); @@ -1128,84 +1103,108 @@ public Dictionary> GetProteinToConfide // One piece of evidence (the default) is simply making the identification. We can then bypass the large calculation below. - var filteredProteinToConfidentModifiedSequences = new Dictionary>(); if (evidenceRequired == 1) - filteredProteinToConfidentModifiedSequences = initialAggregation.ToDictionary(p => p.Key, p => p.Value.Select(v => v.Item1).ToList()); - else + return initialAggregation.ToDictionary(p => p.Key, p => p.Value.Select(v => v.BioPolymerWithSetMods).ToList()); + + var filteredProteinToConfidentModifiedSequences = new Dictionary>(initialAggregation.Count); + var minimumSet = new HashSet(16); + var modificationsToRetain = new HashSet<(int Position, Modification Modification)>(16); + + + foreach (var proteinGroup in initialAggregation) { - foreach (var proteinGroup in initialAggregation) + minimumSet.Clear(); + modificationsToRetain.Clear(); + var protein = proteinGroup.Key; + + // Extract confident modifications that have enough evidence, add to mods to retain + foreach (var modAndLocationGrouped in proteinGroup.Value + .Where(p => p.BioPolymerWithSetMods.AllModsOneIsNterminus.Count > 0) + .SelectMany(withSetMods => withSetMods.BioPolymerWithSetMods.AllModsOneIsNterminus + .Where(p => !p.Value.ModificationType.Equals("Common Fixed")) + .Select(mod => + { + var peptideModIndex = mod.Key; + var modification = mod.Value; + var dissociationType = withSetMods.DissociationType; + var digestionAgent = withSetMods.DigestionAgent; + var startResidue = withSetMods.BioPolymerWithSetMods.OneBasedStartResidue; + var endResidue = withSetMods.BioPolymerWithSetMods.OneBasedEndResidue; + var proteinModIndex = peptideModIndex + startResidue - 1; + var missedCleavages = withSetMods.BioPolymerWithSetMods.MissedCleavages; + + // Add any intermediate processing here + return (proteinModIndex, peptideModIndex, modification, dissociationType, digestionAgent, startResidue, endResidue, missedCleavages); + })) + .GroupBy(p => (p.proteinModIndex, p.modification))) { - var protein = proteinGroup.Key; - var minimumSet = new List(); - var modificationsToRetain = new HashSet<(int Position, Modification Modification)>(); - - // Extract relevant modifications - foreach (var modAndLocationGrouped in proteinGroup.Value - .SelectMany(v => v.BioPolymer.AllModsOneIsNterminus.Select(m => - (m.Key, m.Value, v.DissociationType, v.DigestionAgent, v.BioPolymer.OneBasedStartResidue, v.BioPolymer.OneBasedEndResidue))) - .GroupBy(p => (p.Key, p.Value))) - { - var modGroup = modAndLocationGrouped.ToList(); + // Only one spectral match with this modification on this location + if (modAndLocationGrouped.Count() <= 1) + continue; - // Only one spectral match with this modification on this location - if (modGroup.Count <= 1) - continue; + var dissociationTypeCount = modAndLocationGrouped.GroupBy(p => p.dissociationType).Count(); + var digestionAgentCount = modAndLocationGrouped.GroupBy(p => p.digestionAgent).Count(); - // If the modification identity and position was consistent across evidenceRequired runs by digestion agent or dissociation type, keep it - if (modGroup.DistinctBy(p => p.DissociationType).Count() >= evidenceRequired || modGroup.DistinctBy(p => p.DigestionAgent).Count() >= evidenceRequired) - { - modificationsToRetain.Add(modAndLocationGrouped.Key); - continue; - } + // TODO: Ask Claire about truncy bois. Right now they dont count for anything as they are not true missed cleavages. + var missedCleavageCount = modAndLocationGrouped.GroupBy(p => p.dissociationType) + .Sum(dissGroup => dissGroup.GroupBy(p => p.missedCleavages).Count() - 1); - // If the modification identity and position was consistent across a missed cleavage product, keep it - var distinctTermini = modGroup.DistinctBy(p => (p.OneBasedStartResidue, p.OneBasedEndResidue)).ToArray(); - if (distinctTermini.Length > 1 && (distinctTermini.GroupBy(term => term.OneBasedStartResidue).Count() > 1 || distinctTermini.GroupBy(term => term.OneBasedEndResidue).Count() > 1)) - { - modificationsToRetain.Add(modAndLocationGrouped.Key); - } - } + var conditionCount = dissociationTypeCount + digestionAgentCount + missedCleavageCount - 2; + if (conditionCount >= evidenceRequired) + modificationsToRetain.Add(modAndLocationGrouped.Key); + } - // Sort biopolymers by the number of mods to exclude they cover - // Then by descending number of mods to include they cover - var sortedBioPolymers = proteinGroup.Value.Select(v => v.BioPolymer) - .Select(bioPolymer => new - { - BioPolymerWithSetMods = bioPolymer, - CoveredMods = modificationsToRetain - .Where(mod => - bioPolymer.AllModsOneIsNterminus.TryGetValue(mod.Position, out var bioPolymerMod) && - Equals(bioPolymerMod, mod.Modification)) - .ToHashSet() - }) - .OrderBy(covGroup => covGroup.BioPolymerWithSetMods.AllModsOneIsNterminus - .Count(mod => !modificationsToRetain.Contains((mod.Key, mod.Value)))) - .ThenByDescending(covGroup => covGroup.BioPolymerWithSetMods.AllModsOneIsNterminus - .Count(mod => modificationsToRetain.Contains((mod.Key, mod.Value)))) - .ToList(); + // No modifications to retain + if (modificationsToRetain.Count == 0) + { + filteredProteinToConfidentModifiedSequences[protein] = []; + continue; + } - while (modificationsToRetain.Count > 0 && sortedBioPolymers.Count > 0) + // Sort biopolymers by the number of mods to include they cover then by the number of mods to exclude they bring along + // The goal is to determine the minimum set of IBioPolymerWithSetMods to add which cover all mods to include while bringing the fewest mods to exclude along for the ride + var sortedBioPolymers = proteinGroup.Value.Select(v => v.BioPolymerWithSetMods) + .Where(p => p.AllModsOneIsNterminus.Count > 0) + .Select(bioPolymer => new { - var bestBioPolymer = sortedBioPolymers.First(); - minimumSet.Add(bestBioPolymer.BioPolymerWithSetMods); - foreach (var mod in bestBioPolymer.CoveredMods) - { - modificationsToRetain.Remove(mod); - } - sortedBioPolymers.RemoveAt(0); - sortedBioPolymers = sortedBioPolymers - .Where(covGroup => covGroup.CoveredMods.Overlaps(modificationsToRetain)) - .OrderBy(covGroup => covGroup.BioPolymerWithSetMods.AllModsOneIsNterminus - .Count(mod => modificationsToRetain.Contains((mod.Key, mod.Value)))) - .ThenByDescending(covGroup => covGroup.BioPolymerWithSetMods.AllModsOneIsNterminus - .Count(mod => modificationsToRetain.Contains((mod.Key, mod.Value)))) - .ToList(); - } + BioPolymerWithSetMods = bioPolymer, + + // collect mods that are covered by this biopolymer + CoveredMods = modificationsToRetain + .Where(mod => + bioPolymer.AllModsOneIsNterminus.TryGetValue(mod.Position - bioPolymer.OneBasedStartResidue + 1, out var bioPolymerMod) && + Equals(bioPolymerMod, mod.Modification)) + .ToHashSet() + }) + .OrderByDescending(covGroup => covGroup.CoveredMods.Count) + .ThenBy(covGroup => covGroup.BioPolymerWithSetMods.AllModsOneIsNterminus + .Count(mod => !modificationsToRetain.Contains((mod.Key - covGroup.BioPolymerWithSetMods.OneBasedStartResidue + 1, mod.Value)))) + .ToList(); + + + // iterate through the sorted list until we cover all modifications or use all biopolymers (we should never hit the second case, but stops and infinite loop just in case) + while (modificationsToRetain.Count > 0 && sortedBioPolymers.Count > 0) + { + var bestBioPolymer = sortedBioPolymers.First(); + + minimumSet.Add(bestBioPolymer.BioPolymerWithSetMods); + foreach (var mod in bestBioPolymer.CoveredMods) + modificationsToRetain.Remove(mod); + sortedBioPolymers.RemoveAt(0); + - filteredProteinToConfidentModifiedSequences[protein] = minimumSet; + sortedBioPolymers = sortedBioPolymers + .Where(covGroup => covGroup.CoveredMods.Overlaps(modificationsToRetain)) // retain only those with mods that are not yet covered + .OrderByDescending(covGroup => covGroup.CoveredMods.Count) + .ThenBy(covGroup => covGroup.BioPolymerWithSetMods.AllModsOneIsNterminus + .Count(mod => !modificationsToRetain.Contains((mod.Key - covGroup.BioPolymerWithSetMods.OneBasedStartResidue + 1, mod.Value)))) + .ToList(); } + + filteredProteinToConfidentModifiedSequences[protein] = minimumSet.ToList(); } + return filteredProteinToConfidentModifiedSequences; } @@ -1214,20 +1213,25 @@ public Dictionary> GetProteinToConfide /// Updates the protein modifications based on the confident modified sequences. /// /// Dictionary mapping proteins to their confident modified sequences. - /// Tuple containing sets of modifications to write based on different criteria. /// Dictionary to store the original modifications of proteins. /// Dictionary to store the original modifications of sequence variants. private void UpdateProteinModifications(Dictionary> proteinToConfidentModifiedSequences, - Dictionary>> proteinsOriginalModifications, - Dictionary>> originalSequenceVariantModifications) + out Dictionary>> proteinsOriginalModifications, + out Dictionary>> originalSequenceVariantModifications) { - var modificationsToWrite = PrunedDatabaseWriter.GetModificationsToWrite(Parameters.SearchParameters.ModsToWriteSelection); + proteinsOriginalModifications = new Dictionary>>(proteinToConfidentModifiedSequences.Count); + originalSequenceVariantModifications = new Dictionary>>(proteinToConfidentModifiedSequences.Count); + + var modificationsToWriteInThisSearch = PrunedDatabaseWriter.GetModificationsToWrite(Parameters.SearchParameters.ModsToWriteSelection); + HashSet<(int, Modification, SequenceVariation)> modsObservedOnThisProtein = new HashSet<(int, Modification, SequenceVariation)>(); + IDictionary<(SequenceVariation, int), List> modsToWriteOnThisProtein = new Dictionary<(SequenceVariation, int), List>(); foreach (var nonVariantProtein in Parameters.ProteinList.Select(p => p.NonVariantProtein).Distinct()) { if (nonVariantProtein.IsDecoy) continue; - HashSet<(int, Modification, SequenceVariation)> modsObservedOnThisProtein = new HashSet<(int, Modification, SequenceVariation)>(); + modsToWriteOnThisProtein.Clear(); + modsObservedOnThisProtein.Clear(); proteinToConfidentModifiedSequences.TryGetValue(nonVariantProtein, out var bioPolymersWithSetMods); foreach (var bioPolymerWithSetMods in bioPolymersWithSetMods ?? []) @@ -1245,22 +1249,22 @@ private void UpdateProteinModifications(Dictionary> modsToWrite = new Dictionary<(SequenceVariation, int), List>(); //Add if observed (regardless if in database) foreach (var observedMod in modsObservedOnThisProtein - .Where(observedMod => modificationsToWrite.modificationsToWriteIfObserved.Contains(observedMod.Item2))) + .Where(observedMod => modificationsToWriteInThisSearch.modificationsToWriteIfObserved.Contains(observedMod.Item2))) { - modsToWrite.AddOrCreate((observedMod.Item3, observedMod.Item1), observedMod.Item2); + modsToWriteOnThisProtein.AddOrCreate((observedMod.Item3, observedMod.Item1), observedMod.Item2); } // Add modification if in database (two cases: always or if observed) foreach (var modkv in nonVariantProtein.OneBasedPossibleLocalizedModifications) { - foreach (var mod in modkv.Value.Where(mod => modificationsToWrite.modificationsToWriteIfInDatabase.Contains(mod) || - (modificationsToWrite.modificationsToWriteIfBoth.Contains(mod) && modsObservedOnThisProtein.Contains((modkv.Key, mod, null))))) + foreach (var mod in modkv.Value.Where(mod => + modificationsToWriteInThisSearch.modificationsToWriteIfInDatabase.Contains(mod) || + (modificationsToWriteInThisSearch.modificationsToWriteIfBoth.Contains(mod) && modsObservedOnThisProtein.Contains((modkv.Key, mod, null))))) { - modsToWrite.AddOrCreate((null, modkv.Key), mod); + modsToWriteOnThisProtein.AddOrCreate((null, modkv.Key), mod); } } @@ -1273,17 +1277,10 @@ private void UpdateProteinModifications(Dictionary { mod }); - } - else - { - modsToWrite[(sv, modkv.Key)].Add(mod); - } + modsToWriteOnThisProtein.AddOrCreate((sv, modkv.Key), mod); } } } @@ -1306,7 +1303,7 @@ private void UpdateProteinModifications(Dictionary kv.Key.Item1 == null)) + foreach (var kvp in modsToWriteOnThisProtein.Where(kv => kv.Key.Item1 == null)) { nonVariantProtein.OneBasedPossibleLocalizedModifications.Add(kvp.Key.Item2, kvp.Value); } @@ -1333,7 +1330,7 @@ private void UpdateProteinModifications(Dictionary kv.Key.Item1 != null && kv.Key.Item1.Equals(sv))) + foreach (var kvp in modsToWriteOnThisProtein.Where(kv => kv.Key.Item1 != null && kv.Key.Item1.Equals(sv))) { sv.OneBasedModifications.Add(kvp.Key.Item2, kvp.Value); } @@ -1341,40 +1338,68 @@ private void UpdateProteinModifications(Dictionary WriteDatabases(Dictionary> proteinToConfidentBaseSequences) + private void WriteDatabases(Dictionary> proteinToConfidentBaseSequences) { - List databaseWritingTasks = []; List nestedIds = [Parameters.SearchTaskId]; if (Parameters.DatabaseFilenameList.Any(p => p.IsContaminant)) { + // all proteins, pruned mods string outputXMLdbFullNameContaminants = Path.Combine(Parameters.OutputFolder, string.Join("-", Parameters.DatabaseFilenameList.Where(b => b.IsContaminant).Select(b => Path.GetFileNameWithoutExtension(b.FilePath))) + "pruned.xml"); - var prunedProteins = Parameters.ProteinList.Select(p => p.NonVariantProtein).Where(b => !b.IsDecoy && b.IsContaminant).ToList(); - databaseWritingTasks.Add(PrunedDatabaseWriter.WriteDatabaseAsync(outputXMLdbFullNameContaminants, prunedProteins, nestedIds)); - + var prunedProteins = Parameters.ProteinList.Select(p => p.NonVariantProtein) + .Where(b => !b.IsDecoy && b.IsContaminant) + .ToList(); + PrunedDatabaseWriter.WriteDatabase(outputXMLdbFullNameContaminants, prunedProteins, nestedIds); + // pruned mods and proteins string outputXMLdbFullNameContaminantsProteinPruned = Path.Combine(Parameters.OutputFolder, string.Join("-", Parameters.DatabaseFilenameList.Where(b => b.IsContaminant).Select(b => Path.GetFileNameWithoutExtension(b.FilePath))) + "proteinPruned.xml"); - var proteinPrunedProteins = proteinToConfidentBaseSequences.Keys.Where(b => !b.IsDecoy && b.IsContaminant).ToList(); - databaseWritingTasks.Add(PrunedDatabaseWriter.WriteDatabaseAsync(outputXMLdbFullNameContaminantsProteinPruned, proteinPrunedProteins)); + var proteinPrunedProteins = proteinToConfidentBaseSequences.Keys + .Where(b => !b.IsDecoy && b.IsContaminant) + .ToList(); + PrunedDatabaseWriter.WriteDatabase(outputXMLdbFullNameContaminantsProteinPruned, proteinPrunedProteins, nestedIds); } if (Parameters.DatabaseFilenameList.Any(b => !b.IsContaminant)) { + // all proteins, pruned mods string outputXMLdbFullName = Path.Combine(Parameters.OutputFolder, string.Join("-", Parameters.DatabaseFilenameList.Where(b => !b.IsContaminant).Select(b => Path.GetFileNameWithoutExtension(b.FilePath))) + "pruned.xml"); - var prunedProteins = Parameters.ProteinList.Select(p => p.NonVariantProtein).Where(b => !b.IsDecoy && !b.IsContaminant).ToList(); - databaseWritingTasks.Add(PrunedDatabaseWriter.WriteDatabaseAsync(outputXMLdbFullName, prunedProteins)); - - + var prunedProteins = Parameters.ProteinList.Select(p => p.NonVariantProtein) + .Where(b => !b.IsDecoy && !b.IsContaminant) + .ToList(); + PrunedDatabaseWriter.WriteDatabase(outputXMLdbFullName, prunedProteins, nestedIds); + // pruned mods and proteins string outputXMLdbFullNameProteinPruned = Path.Combine(Parameters.OutputFolder, string.Join("-", Parameters.DatabaseFilenameList.Where(b => !b.IsContaminant).Select(b => Path.GetFileNameWithoutExtension(b.FilePath))) + "proteinPruned.xml"); - var proteinPrunedProteins = proteinToConfidentBaseSequences.Keys.Where(b => !b.IsDecoy && !b.IsContaminant).ToList(); - databaseWritingTasks.Add(PrunedDatabaseWriter.WriteDatabaseAsync(outputXMLdbFullNameProteinPruned, proteinPrunedProteins)); + var proteinPrunedProteins = proteinToConfidentBaseSequences.Keys + .Where(b => !b.IsDecoy && !b.IsContaminant) + .ToList(); + PrunedDatabaseWriter.WriteDatabase(outputXMLdbFullNameProteinPruned, proteinPrunedProteins, nestedIds); } + } - return databaseWritingTasks; + private void RestoreOriginalModifications(in Dictionary>> proteinsOriginalModifications, + in Dictionary>> originalSequenceVariantModifications) + { + foreach (var nonVariantProtein in Parameters.ProteinList.Select(p => p.NonVariantProtein).Distinct()) + { + if (nonVariantProtein.IsDecoy) continue; + nonVariantProtein.OneBasedPossibleLocalizedModifications.Clear(); + foreach (var originalMod in proteinsOriginalModifications[nonVariantProtein.NonVariantProtein]) + { + nonVariantProtein.OneBasedPossibleLocalizedModifications.Add(originalMod.Key, originalMod.Value); + } + foreach (var sv in nonVariantProtein.SequenceVariations) + { + sv.OneBasedModifications.Clear(); + foreach (var originalVariantMods in originalSequenceVariantModifications[sv]) + { + sv.OneBasedModifications.Add(originalVariantMods.Key, originalVariantMods.Value); + } + } + } } #endregion