From 4e95528b9c1b6015a6fc0249e96110356a546d2b Mon Sep 17 00:00:00 2001 From: Alexander-Sol <41119316+Alexander-Sol@users.noreply.github.com> Date: Mon, 5 Aug 2024 13:23:03 -0500 Subject: [PATCH] Changes to PEP Calculation and Filtering (#2387) * new style computation of pep q-value * fixed unit tests * separate PsmFdrInfo and PeptideFdrInfo calculations in FdrAnalysisEngine * d * fdh * not working yet * maybe better maybe not * huh * tr * j * 53 * Fixed filtering kinda * commit before i start breaking things * still sorta broken * idk * Fixed most issues, moved filtering to MetaMorpheus Task * fix multiprotease unit test * fix MakeSureFdrDoesntSkip * fix TestPeptideCount * new postsearchanalysistask results generator * fixed results output in postsearchanalysistask * yert * fix pep q-value calc * fix peptideFdrTest * fix spectral recovery * ity * fix semi specific test * fix metadraw test * lkah * poiu * slice test fixed * some tests * some testst * fixed most of silac unit tests * hmm * dsg * uio * kjg * ghk * Fixed the few remaining tests that were breaking * Five tests breaking, mostly numbers * Fixed results.txt writer for PEP-Q-values. * Fixed output bug * idk * broken * Finally fixed!!! * Added QValueThresholdForPEP to common params * Addressed Nic's comments * Fixed tests that broke when addressing Nic's comments * Made fields in FilteredPsms more explicit * added comments - MRS * More comments, better ordering --------- Co-authored-by: Michael Shortreed --- .../ClassicSearch/ClassicSearchEngine.cs | 2 +- MetaMorpheus/EngineLayer/CommonParameters.cs | 8 + .../FdrAnalysis/FdrAnalysisEngine.cs | 254 ++++-- .../FdrAnalysis/PEPValueAnalysisGeneric.cs | 15 +- .../NonSpecificEnzymeSearchEngine.cs | 14 +- .../ProteinParsimonyEngine.cs | 4 +- .../EngineLayer/PsmTsv/PsmTsvWriter.cs | 21 +- MetaMorpheus/EngineLayer/SpectralMatch.cs | 23 +- MetaMorpheus/TaskLayer/FilteredPsms.cs | 155 ++++ .../MbrAnalysis/SpectralRecoveryRunner.cs | 45 +- MetaMorpheus/TaskLayer/MetaMorpheusTask.cs | 9 +- MetaMorpheus/TaskLayer/MyTaskResults.cs | 10 +- .../SearchTask/PostSearchAnalysisTask.cs | 826 +++++++++--------- MetaMorpheus/Test/FdrTest.cs | 1 + .../Test/MultiProteaseParsimonyTest.cs | 27 +- MetaMorpheus/Test/MyTaskTest.cs | 45 +- .../Test/PostSearchAnalysisTaskTests.cs | 103 ++- MetaMorpheus/Test/SearchEngineTests.cs | 4 +- MetaMorpheus/Test/SearchTaskTest.cs | 32 +- MetaMorpheus/Test/SilacTest.cs | 3 +- MetaMorpheus/Test/SpectralRecoveryTest.cs | 25 +- MetaMorpheus/Test/TestDataFile.cs | 2 +- MetaMorpheus/Test/TestPsm.cs | 2 +- 23 files changed, 963 insertions(+), 667 deletions(-) create mode 100644 MetaMorpheus/TaskLayer/FilteredPsms.cs diff --git a/MetaMorpheus/EngineLayer/ClassicSearch/ClassicSearchEngine.cs b/MetaMorpheus/EngineLayer/ClassicSearch/ClassicSearchEngine.cs index ee15ab360..9cf82dd29 100644 --- a/MetaMorpheus/EngineLayer/ClassicSearch/ClassicSearchEngine.cs +++ b/MetaMorpheus/EngineLayer/ClassicSearch/ClassicSearchEngine.cs @@ -55,7 +55,7 @@ public ClassicSearchEngine(SpectralMatch[] globalPsms, Ms2ScanWithSpecificMass[] protected override MetaMorpheusEngineResults RunSpecific() { - Status("Getting ms2 scans..."); + Status("Getting ms2 scans..."); double proteinsSearched = 0; int oldPercentProgress = 0; diff --git a/MetaMorpheus/EngineLayer/CommonParameters.cs b/MetaMorpheus/EngineLayer/CommonParameters.cs index 25dc9370e..335749707 100644 --- a/MetaMorpheus/EngineLayer/CommonParameters.cs +++ b/MetaMorpheus/EngineLayer/CommonParameters.cs @@ -34,6 +34,7 @@ public CommonParameters( int totalPartitions = 1, double qValueThreshold = 0.01, double pepQValueThreshold = 1.0, + double qValueCutoffForPepCalculation = 0.005, double scoreCutoff = 5, int? numberOfPeaksToKeepPerWindow = 200, double? minimumAllowedIntensityRatioToBasePeak = 0.01, @@ -67,6 +68,7 @@ public CommonParameters( TotalPartitions = totalPartitions; QValueThreshold = qValueThreshold; PepQValueThreshold = pepQValueThreshold; + QValueCutoffForPepCalculation = qValueCutoffForPepCalculation; ScoreCutoff = scoreCutoff; NumberOfPeaksToKeepPerWindow = numberOfPeaksToKeepPerWindow; MinimumAllowedIntensityRatioToBasePeak = minimumAllowedIntensityRatioToBasePeak; @@ -157,6 +159,11 @@ public int DeconvolutionMaxAssumedChargeState /// public double PepQValueThreshold { get; private set; } public double ScoreCutoff { get; private set; } + /// + /// This parameter determines which PSMs/Peptides will be used as postive training examples + /// when training the GBDT model for PEP. + /// + public double QValueCutoffForPepCalculation { get; private set; } public DigestionParams DigestionParams { get; private set; } public bool ReportAllAmbiguity { get; private set; } public int? NumberOfPeaksToKeepPerWindow { get; private set; } @@ -225,6 +232,7 @@ public CommonParameters CloneWithNewTerminus(FragmentationTerminus? terminus = n TotalPartitions, QValueThreshold, PepQValueThreshold, + QValueCutoffForPepCalculation, ScoreCutoff, NumberOfPeaksToKeepPerWindow, MinimumAllowedIntensityRatioToBasePeak, diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs index bb5be3b5c..454bcebbc 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/FdrAnalysisEngine.cs @@ -1,6 +1,8 @@ using System; using System.Collections.Generic; +using System.IO; using System.Linq; +using System.Text.RegularExpressions; namespace EngineLayer.FdrAnalysis { @@ -8,23 +10,37 @@ public class FdrAnalysisEngine : MetaMorpheusEngine { private List AllPsms; private readonly int MassDiffAcceptorNumNotches; - private readonly double ScoreCutoff; private readonly string AnalysisType; private readonly string OutputFolder; // used for storing PEP training models private readonly bool DoPEP; public FdrAnalysisEngine(List psms, int massDiffAcceptorNumNotches, CommonParameters commonParameters, - List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, List nestedIds, string analysisType = "PSM", bool doPEP = true, string outputFolder = null) : base(commonParameters, fileSpecificParameters, nestedIds) + List<(string fileName, CommonParameters fileSpecificParameters)> fileSpecificParameters, List nestedIds, string analysisType = "PSM", + bool doPEP = true, string outputFolder = null) : base(commonParameters, fileSpecificParameters, nestedIds) { AllPsms = psms.OrderByDescending(p => p).ToList(); MassDiffAcceptorNumNotches = massDiffAcceptorNumNotches; - ScoreCutoff = commonParameters.ScoreCutoff; AnalysisType = analysisType; - this.OutputFolder = outputFolder; - this.DoPEP = doPEP; + OutputFolder = outputFolder; + DoPEP = doPEP; + if (AllPsms.Any()) + AddPsmAndPeptideFdrInfoIfNotPresent(); if (fileSpecificParameters == null) throw new ArgumentNullException("file specific parameters cannot be null"); } + private void AddPsmAndPeptideFdrInfoIfNotPresent() + { + foreach (var psm in AllPsms.Where(p=> p.PsmFdrInfo == null)) + { + psm.PsmFdrInfo = new FdrInfo(); + } + + foreach (var psm in AllPsms.Where(p => p.PeptideFdrInfo == null)) + { + psm.PeptideFdrInfo = new FdrInfo(); + } + } + protected override MetaMorpheusEngineResults RunSpecific() { FdrAnalysisResults myAnalysisResults = new FdrAnalysisResults(this, AnalysisType); @@ -47,49 +63,89 @@ private void DoFalseDiscoveryRateAnalysis(FdrAnalysisResults myAnalysisResults) foreach (var proteasePsms in psmsGroupedByProtease) { - var psms = proteasePsms.ToList(); - - QValueTraditional(psms); - if (psms.Count > 100) + var psms = proteasePsms.OrderByDescending(p => p).ToList(); + var peptides = psms + .OrderByDescending(p => p) + .GroupBy(b => b.FullSequence) + .Select(b => b.FirstOrDefault()) + .ToList(); + + if (psms.Count > 100 & DoPEP) { - if (DoPEP) + CalculateQValue(psms, peptideLevelCalculation: false, pepCalculation: false); + if (peptides.Count > 100 ) + { + CalculateQValue(peptides, peptideLevelCalculation: true, pepCalculation: false); + + //PEP will model will be developed using peptides and then applied to all PSMs. + Compute_PEPValue(myAnalysisResults, psms); + + // peptides are ordered by MM score from good to bad in order to select the best PSM for each peptide + peptides = psms + .OrderByDescending(p => p) + .GroupBy(p => p.FullSequence) + .Select(p => p.FirstOrDefault()) + .OrderBy(p => p.FdrInfo.PEP) // Then order by PEP (PSM PEP and Peptide PEP are the same) + .ThenByDescending(p => p) + .ToList(); + CalculateQValue(peptides, peptideLevelCalculation: true, pepCalculation: true); + + psms = psms.OrderBy(p => p.PsmFdrInfo.PEP).ThenByDescending(p => p).ToList(); + CalculateQValue(psms, peptideLevelCalculation: false, pepCalculation: true); + + } + else //we have more than 100 psms but less than 100 peptides so { - Compute_PEPValue(myAnalysisResults); + //this will be done using PSMs because we dont' have enough peptides + Compute_PEPValue(myAnalysisResults, psms); + psms = psms.OrderBy(p => p.PsmFdrInfo.PEP).ThenByDescending(p => p).ToList(); + CalculateQValue(psms, peptideLevelCalculation: false, pepCalculation: true); } - Compute_PEPValue_Based_QValue(psms); - QValueInverted(psms); } - CountPsm(psms); - } - } - - private static void QValueInverted(List psms) - { - psms.Reverse(); - //this calculation is performed from bottom up. So, we begin the loop by computing qValue - //and qValueNotch for the last/lowest scoring psm in the bunch - double qValue = (psms[0].FdrInfo.CumulativeDecoy + 1) / psms[0].FdrInfo.CumulativeTarget; - double qValueNotch = (psms[0].FdrInfo.CumulativeDecoyNotch + 1) / psms[0].FdrInfo.CumulativeTargetNotch; - - //Assign FDR values to PSMs - for (int i = 0; i < psms.Count; i++) - { - // Stop if canceled - if (GlobalVariables.StopLoops) { break; } - - qValue = Math.Min(qValue, (psms[i].FdrInfo.CumulativeDecoy + 1) / psms[i].FdrInfo.CumulativeTarget); - qValueNotch = Math.Min(qValueNotch, (psms[i].FdrInfo.CumulativeDecoyNotch + 1) / psms[i].FdrInfo.CumulativeTargetNotch); - - double pep = psms[i].FdrInfo == null ? double.NaN : psms[i].FdrInfo.PEP; - double pepQValue = psms[i].FdrInfo == null ? double.NaN : psms[i].FdrInfo.PEP_QValue; + else if(psms.Any(psm => psm.FdrInfo.PEP > 0)) + { + // If PEP's have been calculated, but doPEP = false, then we don't want to train another model, + // but we do want to calculate pep q-values + // really, in this case, we only need to run one or the other (i.e., only peptides or psms are passed in) + // but there's no mechanism to pass that info to the FDR analysis engine, so we'll do this for now + peptides = psms + .OrderByDescending(p => p) + .GroupBy(p => p.FullSequence) + .Select(p => p.FirstOrDefault()) // Get the best psm for each peptide based on MBR score + .OrderBy(p => p.FdrInfo.PEP) // Then order by PEP (PSM PEP and Peptide PEP are the same) + .ThenByDescending(p => p) + .ToList(); + CalculateQValue(peptides, peptideLevelCalculation: true, pepCalculation: true); + + psms = psms + .OrderBy(p => p.PsmFdrInfo.PEP) + .ThenByDescending(p => p) + .ToList(); + CalculateQValue(psms, peptideLevelCalculation: false, pepCalculation: true); - psms[i].SetQandPEPvalues(qValue, qValueNotch, pep, pepQValue); + } + //we do this section last so that target and decoy counts written in the psmtsv files are appropriate for the sort order which is by MM score + peptides = psms + .OrderByDescending(p => p) + .GroupBy(b => b.FullSequence) + .Select(b => b.FirstOrDefault()) + .ToList(); + CalculateQValue(peptides, peptideLevelCalculation: true, pepCalculation: false); + + psms = psms.OrderByDescending(p => p).ToList(); + CalculateQValue(psms, peptideLevelCalculation: false, pepCalculation: false); + + CountPsm(psms); } - psms.Reverse(); //we inverted the psms for this calculation. now we need to put them back into the original order } - private void QValueTraditional(List psms) + /// + /// This methods assumes that PSMs are already sorted appropriately for downstream usage + /// Then, it counts the number of targets and (fractional) decoys, writes those values to the + /// appropriate FdrInfo (PSM or Peptide level), and calculates q-values + /// + public void CalculateQValue(List psms, bool peptideLevelCalculation, bool pepCalculation = false) { double cumulativeTarget = 0; double cumulativeDecoy = 0; @@ -99,16 +155,14 @@ private void QValueTraditional(List psms) double[] cumulativeDecoyPerNotch = new double[MassDiffAcceptorNumNotches + 1]; //Assign FDR values to PSMs - for (int i = 0; i < psms.Count; i++) + foreach (var psm in psms) { // Stop if canceled if (GlobalVariables.StopLoops) { break; } - SpectralMatch psm = psms[i]; int notch = psm.Notch ?? MassDiffAcceptorNumNotches; if (psm.IsDecoy) { - // the PSM can be ambiguous between a target and a decoy sequence // in that case, count it as the fraction of decoy hits // e.g. if the PSM matched to 1 target and 2 decoys, it counts as 2/3 decoy double decoyHits = 0; @@ -122,7 +176,6 @@ private void QValueTraditional(List psms) } totalHits++; } - cumulativeDecoy += decoyHits / totalHits; cumulativeDecoyPerNotch[notch] += decoyHits / totalHits; } @@ -132,61 +185,110 @@ private void QValueTraditional(List psms) cumulativeTargetPerNotch[notch]++; } - double qValue = Math.Min(1, cumulativeDecoy / cumulativeTarget); - double qValueNotch = Math.Min(1, cumulativeDecoyPerNotch[notch] / cumulativeTargetPerNotch[notch]); + psm.GetFdrInfo(peptideLevelCalculation).CumulativeDecoy = cumulativeDecoy; + psm.GetFdrInfo(peptideLevelCalculation).CumulativeTarget = cumulativeTarget; + psm.GetFdrInfo(peptideLevelCalculation).CumulativeDecoyNotch = cumulativeDecoyPerNotch[notch]; + psm.GetFdrInfo(peptideLevelCalculation).CumulativeTargetNotch = cumulativeTargetPerNotch[notch]; - double pep = psm.FdrInfo == null ? double.NaN : psm.FdrInfo.PEP; - double pepQValue = psm.FdrInfo == null ? double.NaN : psm.FdrInfo.PEP_QValue; + } + + if (pepCalculation) + { + PepQValueInverted(psms, peptideLevelAnalysis: peptideLevelCalculation); + } + else + { + if(psms.Count < 100) + { - psm.SetFdrValues(cumulativeTarget, cumulativeDecoy, qValue, cumulativeTargetPerNotch[notch], cumulativeDecoyPerNotch[notch], qValueNotch, pep, pepQValue); + QValueTraditional(psms, peptideLevelAnalysis: peptideLevelCalculation); + } + else + { + QValueInverted(psms, peptideLevelAnalysis: peptideLevelCalculation); + } } } - public void Compute_PEPValue(FdrAnalysisResults myAnalysisResults) + /// + /// This method is used only to calculate q-values for total PSM counts below 100 + /// + private void QValueTraditional(List psms, bool peptideLevelAnalysis) { - if (AnalysisType == "PSM") + double qValue = 0; + double qValueNotch = 0; + for (int i = 0; i < psms.Count; i++) { - //Need some reasonable number of PSMs to train on to get a reasonable estimation of the PEP - if (AllPsms.Count > 100) - { - string searchType = "standard"; - if (AllPsms[0].DigestionParams.Protease.Name == "top-down") - { - searchType = "top-down"; - } + // Stop if canceled + if (GlobalVariables.StopLoops) { break; } - myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(AllPsms, searchType, this.FileSpecificParameters, this.OutputFolder); + qValue = Math.Max(qValue, psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoy / Math.Max(psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeTarget, 1)); + qValueNotch = Math.Max(qValueNotch, psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoyNotch / Math.Max(psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeTargetNotch, 1)); - Compute_PEPValue_Based_QValue(AllPsms); - } + psms[i].GetFdrInfo(peptideLevelAnalysis).QValue = Math.Min(qValue, 1); + psms[i].GetFdrInfo(peptideLevelAnalysis).QValueNotch = Math.Min(qValueNotch, 1); } + } + + private static void QValueInverted(List psms, bool peptideLevelAnalysis) + { + psms.Reverse(); + //this calculation is performed from bottom up. So, we begin the loop by computing qValue + //and qValueNotch for the last/lowest scoring psm in the bunch + double qValue = (psms[0].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoy + 1) / psms[0].GetFdrInfo(peptideLevelAnalysis).CumulativeTarget; + double qValueNotch = (psms[0].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoyNotch + 1) / psms[0].GetFdrInfo(peptideLevelAnalysis).CumulativeTargetNotch; - if (AnalysisType == "Peptide") + //Assign FDR values to PSMs + for (int i = 0; i < psms.Count; i++) { - Compute_PEPValue_Based_QValue(AllPsms); + // Stop if canceled + if (GlobalVariables.StopLoops) { break; } + + qValue = Math.Min(qValue, (psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoy + 1) / Math.Max(psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeTarget, 1)); + qValueNotch = Math.Min(qValueNotch, (psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoyNotch + 1) / Math.Max(psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeTargetNotch, 1)); + + psms[i].GetFdrInfo(peptideLevelAnalysis).QValue = Math.Min(qValue, 1); + psms[i].GetFdrInfo(peptideLevelAnalysis).QValueNotch = Math.Min(qValueNotch, 1); } + psms.Reverse(); //we inverted the psms for this calculation. now we need to put them back into the original order + } - if (AnalysisType == "crosslink" && AllPsms.Count > 100) + public static void PepQValueInverted(List psms, bool peptideLevelAnalysis) + { + psms.Reverse(); + //this calculation is performed from bottom up. So, we begin the loop by computing qValue + //and qValueNotch for the last/lowest scoring psm in the bunch + double qValue = (psms[0].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoy + 1) / psms[0].GetFdrInfo(peptideLevelAnalysis).CumulativeTarget; + + //Assign FDR values to PSMs + for (int i = 0; i < psms.Count; i++) { - myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(AllPsms, "crosslink", this.FileSpecificParameters, this.OutputFolder); - Compute_PEPValue_Based_QValue(AllPsms); + // Stop if canceled + if (GlobalVariables.StopLoops) { break; } + + qValue = Math.Min(qValue, (psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeDecoy + 1) / psms[i].GetFdrInfo(peptideLevelAnalysis).CumulativeTarget); + + psms[i].GetFdrInfo(peptideLevelAnalysis).PEP_QValue = qValue; } + psms.Reverse(); //we inverted the psms for this calculation. now we need to put them back into the original order } - public static void Compute_PEPValue_Based_QValue(List psms) + public void Compute_PEPValue(FdrAnalysisResults myAnalysisResults, List psms) { - double[] allPEPValues = psms.Select(p => p.FdrInfo.PEP).ToArray(); - int[] psmsArrayIndicies = Enumerable.Range(0, psms.Count).ToArray(); - Array.Sort(allPEPValues, psmsArrayIndicies);//sort the second thing by the first - - double runningSum = 0; - for (int i = 0; i < allPEPValues.Length; i++) + if (psms[0].DigestionParams.Protease.Name == "top-down") { - runningSum += allPEPValues[i]; - double qValue = runningSum / (i + 1); - psms[psmsArrayIndicies[i]].FdrInfo.PEP_QValue = Math.Round(qValue, 6); + myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "top-down", this.FileSpecificParameters, this.OutputFolder); + } + else if (psms[0].DigestionParams.Protease.Name == "crosslink") + { + myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "crosslink", this.FileSpecificParameters, this.OutputFolder); + } + else + { + myAnalysisResults.BinarySearchTreeMetrics = PEP_Analysis_Cross_Validation.ComputePEPValuesForAllPSMsGeneric(psms, "standard", this.FileSpecificParameters, this.OutputFolder); } } + /// /// This method gets the count of PSMs with the same full sequence (with q-value < 0.01) to include in the psmtsv output /// diff --git a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs index 56230f449..2fa22248f 100644 --- a/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs +++ b/MetaMorpheus/EngineLayer/FdrAnalysis/PEPValueAnalysisGeneric.cs @@ -30,7 +30,9 @@ public static class PEP_Analysis_Cross_Validation /// A dictionary which stores the chimeric ID string in the key and the number of chimeric identifications as the vale /// private static Dictionary chimeraCountDictionary = new Dictionary(); - + public static bool UsePeptideLevelQValueForTraining = true; + public static double QValueCutoff = 0.005; + /// /// This method is used to compute the PEP values for all PSMs in a dataset. @@ -52,6 +54,8 @@ public static string ComputePEPValuesForAllPSMsGeneric(List psms, .Select(b => b.FirstOrDefault()).ToList(); List countOfPeptidesInEachFile = peptides.GroupBy(b => b.FullFilePath).Select(b => b.Count()).ToList(); bool allFilesContainPeptides = (countOfPeptidesInEachFile.Count == fileSpecificParameters.Count); //rare condition where each file has psms but some files don't have peptides. probably only happens in unit tests. + UsePeptideLevelQValueForTraining = true; + QValueCutoff = fileSpecificParameters.Select(t => t.fileSpecificParameters.QValueCutoffForPepCalculation).Min(); int chargeStateMode = 0; Dictionary fileSpecificMedianFragmentMassErrors = new Dictionary(); @@ -67,10 +71,12 @@ public static string ComputePEPValuesForAllPSMsGeneric(List psms, else { //there are too few psms to do any meaningful training if we used only peptides. So, we will train using psms instead. + UsePeptideLevelQValueForTraining = false; allPeptideIndices = Enumerable.Range(0, psms.Count).ToList(); chargeStateMode = GetChargeStateMode(psms); fileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(psms); } + //These two dictionaries contain the average and standard deviations of hydrophobicitys measured in 1 minute increments accross each raw //file separately. An individully measured hydrobophicty calculated for a specific PSM sequence is compared to these values by computing @@ -378,7 +384,8 @@ public static void RemoveBestMatchingPeptidesWithLowPEP(SpectralMatch psm, List< psm.RemoveThisAmbiguousPeptide(notches[i], pwsmList[i]); ambiguousPeptidesRemovedCount++; } - psm.FdrInfo.PEP = 1 - pepValuePredictions.Max(); + psm.PsmFdrInfo.PEP = 1 - pepValuePredictions.Max(); + psm.PeptideFdrInfo.PEP = 1 - pepValuePredictions.Max(); } /// @@ -712,7 +719,7 @@ public static IEnumerable CreatePsmData(string searchType, List<(string label = false; newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); } - else if (!csm.IsDecoy && !csm.BetaPeptide.IsDecoy && psm.FdrInfo.QValue <= 0.005) + else if (!csm.IsDecoy && !csm.BetaPeptide.IsDecoy && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) { label = true; newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, csm.BestMatchingBioPolymersWithSetMods.First().Peptide, 0, label); @@ -736,7 +743,7 @@ public static IEnumerable CreatePsmData(string searchType, List<(string label = false; newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, peptideWithSetMods, notch, label); } - else if (!peptideWithSetMods.Parent.IsDecoy && psm.FdrInfo.QValue <= 0.005) + else if (!peptideWithSetMods.Parent.IsDecoy && psm.GetFdrInfo(UsePeptideLevelQValueForTraining).QValue <= QValueCutoff) { label = true; newPsmData = CreateOnePsmDataEntry(searchType, fileSpecificParameters, psm, timeDependantHydrophobicityAverageAndDeviation_unmodified, timeDependantHydrophobicityAverageAndDeviation_modified, fileSpecificMedianFragmentMassErrors, chargeStateMode, peptideWithSetMods, notch, label); diff --git a/MetaMorpheus/EngineLayer/NonSpecificEnzymeSearch/NonSpecificEnzymeSearchEngine.cs b/MetaMorpheus/EngineLayer/NonSpecificEnzymeSearch/NonSpecificEnzymeSearchEngine.cs index 1426bd4aa..4c23fa7d2 100644 --- a/MetaMorpheus/EngineLayer/NonSpecificEnzymeSearch/NonSpecificEnzymeSearchEngine.cs +++ b/MetaMorpheus/EngineLayer/NonSpecificEnzymeSearch/NonSpecificEnzymeSearchEngine.cs @@ -475,7 +475,7 @@ public static List ResolveFdrCategorySpecificPsms(List x != null).Count(x => x.FdrInfo.QValue <= 0.01); //set ranking as number of psms above 1% FDR + ranking[i] = AllPsms[i].Where(x => x != null).Count(x => x.PsmFdrInfo.QValue <= 0.01); //set ranking as number of psms above 1% FDR indexesOfInterest.Add(i); } } @@ -515,9 +515,9 @@ public static List ResolveFdrCategorySpecificPsms(List minorPsm.FdrInfo.QValue) + if (majorPsm.PsmFdrInfo.QValue > minorPsm.PsmFdrInfo.QValue) { - minorPsm.FdrInfo.QValue = majorPsm.FdrInfo.QValue; + minorPsm.PsmFdrInfo.QValue = majorPsm.PsmFdrInfo.QValue; } minorPsmIndex++; } @@ -527,9 +527,9 @@ public static List ResolveFdrCategorySpecificPsms(List minorPsm.FdrInfo.QValue) + if (majorPsm.PsmFdrInfo.QValue > minorPsm.PsmFdrInfo.QValue) { - minorPsm.FdrInfo.QValue = majorPsm.FdrInfo.QValue; + minorPsm.PsmFdrInfo.QValue = majorPsm.PsmFdrInfo.QValue; } minorPsmIndex++; } @@ -548,7 +548,7 @@ public static List ResolveFdrCategorySpecificPsms(List bestPsm.Score)) { @@ -587,7 +587,7 @@ public static List ResolveFdrCategorySpecificPsms(List b.FdrInfo.QValue).ThenByDescending(b => b.Score).ToList(); + return bestPsmsList.OrderBy(b => b.PsmFdrInfo.QValue).ThenByDescending(b => b.Score).ToList(); } public static List GetVariableTerminalMods(FragmentationTerminus fragmentationTerminus, List variableModifications) diff --git a/MetaMorpheus/EngineLayer/ProteinParsimony/ProteinParsimonyEngine.cs b/MetaMorpheus/EngineLayer/ProteinParsimony/ProteinParsimonyEngine.cs index d57cc666b..081aab800 100644 --- a/MetaMorpheus/EngineLayer/ProteinParsimony/ProteinParsimonyEngine.cs +++ b/MetaMorpheus/EngineLayer/ProteinParsimony/ProteinParsimonyEngine.cs @@ -41,11 +41,11 @@ public ProteinParsimonyEngine(List allPsms, bool modPeptidesAreDi // KEEP contaminants for use in parsimony! if (modPeptidesAreDifferent) { - _fdrFilteredPsms = allPsms.Where(p => p.FullSequence != null && p.FdrInfo.QValue <= FdrCutoffForParsimony && p.FdrInfo.QValueNotch <= FdrCutoffForParsimony).ToList(); + _fdrFilteredPsms = allPsms.Where(p => p.FullSequence != null && p.PsmFdrInfo.QValue <= FdrCutoffForParsimony && p.PsmFdrInfo.QValueNotch <= FdrCutoffForParsimony).ToList(); } else { - _fdrFilteredPsms = allPsms.Where(p => p.BaseSequence != null && p.FdrInfo.QValue <= FdrCutoffForParsimony && p.FdrInfo.QValueNotch <= FdrCutoffForParsimony).ToList(); + _fdrFilteredPsms = allPsms.Where(p => p.BaseSequence != null && p.PsmFdrInfo.QValue <= FdrCutoffForParsimony && p.PsmFdrInfo.QValueNotch <= FdrCutoffForParsimony).ToList(); } // peptides to use in parsimony = peptides observed in high-confidence PSMs (including decoys) diff --git a/MetaMorpheus/EngineLayer/PsmTsv/PsmTsvWriter.cs b/MetaMorpheus/EngineLayer/PsmTsv/PsmTsvWriter.cs index 89404391e..4c31717e9 100644 --- a/MetaMorpheus/EngineLayer/PsmTsv/PsmTsvWriter.cs +++ b/MetaMorpheus/EngineLayer/PsmTsv/PsmTsvWriter.cs @@ -316,7 +316,7 @@ internal static void AddMatchedIonsData(Dictionary s, List s, SpectralMatch peptide) + internal static void AddMatchScoreData(Dictionary s, SpectralMatch peptide, bool writePeptideLevelFdr = false) { string spectralAngle = peptide == null ? " " : peptide.SpectralAngle.ToString("F4"); string localizedScores = " "; @@ -339,17 +339,18 @@ internal static void AddMatchScoreData(Dictionary s, SpectralMat string PEP = " "; string PEP_Qvalue = " "; - if (peptide != null && peptide.FdrInfo != null) + if (peptide != null && peptide.GetFdrInfo(writePeptideLevelFdr) != null) { - cumulativeTarget = peptide.FdrInfo.CumulativeTarget.ToString(CultureInfo.InvariantCulture); - cumulativeDecoy = peptide.FdrInfo.CumulativeDecoy.ToString(CultureInfo.InvariantCulture); - qValue = peptide.FdrInfo.QValue.ToString("F6", CultureInfo.InvariantCulture); - cumulativeTargetNotch = peptide.FdrInfo.CumulativeTargetNotch.ToString(CultureInfo.InvariantCulture); - cumulativeDecoyNotch = peptide.FdrInfo.CumulativeDecoyNotch.ToString(CultureInfo.InvariantCulture); - qValueNotch = peptide.FdrInfo.QValueNotch.ToString("F6", CultureInfo.InvariantCulture); - PEP = peptide.FdrInfo.PEP.ToString(); - PEP_Qvalue = peptide.FdrInfo.PEP_QValue.ToString(); + cumulativeTarget = peptide.GetFdrInfo(writePeptideLevelFdr).CumulativeTarget.ToString(CultureInfo.InvariantCulture); + cumulativeDecoy = peptide.GetFdrInfo(writePeptideLevelFdr).CumulativeDecoy.ToString(CultureInfo.InvariantCulture); + qValue = peptide.GetFdrInfo(writePeptideLevelFdr).QValue.ToString("F6", CultureInfo.InvariantCulture); + cumulativeTargetNotch = peptide.GetFdrInfo(writePeptideLevelFdr).CumulativeTargetNotch.ToString(CultureInfo.InvariantCulture); + cumulativeDecoyNotch = peptide.GetFdrInfo(writePeptideLevelFdr).CumulativeDecoyNotch.ToString(CultureInfo.InvariantCulture); + qValueNotch = peptide.GetFdrInfo(writePeptideLevelFdr).QValueNotch.ToString("F6", CultureInfo.InvariantCulture); + PEP = peptide.GetFdrInfo(writePeptideLevelFdr).PEP.ToString(); + PEP_Qvalue = peptide.GetFdrInfo(writePeptideLevelFdr).PEP_QValue.ToString(); } + s[PsmTsvHeader.CumulativeTarget] = cumulativeTarget; s[PsmTsvHeader.CumulativeDecoy] = cumulativeDecoy; s[PsmTsvHeader.QValue] = qValue; diff --git a/MetaMorpheus/EngineLayer/SpectralMatch.cs b/MetaMorpheus/EngineLayer/SpectralMatch.cs index 1e4caa45a..96ef0f644 100644 --- a/MetaMorpheus/EngineLayer/SpectralMatch.cs +++ b/MetaMorpheus/EngineLayer/SpectralMatch.cs @@ -73,7 +73,20 @@ protected SpectralMatch(IBioPolymerWithSetMods peptide, int notch, double score, public string FullFilePath { get; private set; } public int ScanIndex { get; } public int NumDifferentMatchingPeptides { get { return _BestMatchingBioPolymersWithSetMods.Count; } } - public FdrInfo FdrInfo { get; private set; } + + public FdrInfo FdrInfo + { + get => PsmFdrInfo; + set => PsmFdrInfo = value; + + } + public FdrInfo PsmFdrInfo { get; set; } + public FdrInfo PeptideFdrInfo { get; set; } + public FdrInfo GetFdrInfo(bool peptideLevel) + { + return peptideLevel ? PeptideFdrInfo : PsmFdrInfo; + } + public PsmData PsmData_forPEPandPercolator { get; set; } public double Score { get; private set; } @@ -266,18 +279,18 @@ public override string ToString() return ToString(new Dictionary()); } - public string ToString(IReadOnlyDictionary ModstoWritePruned) + public string ToString(IReadOnlyDictionary ModstoWritePruned, bool writePeptideLevelFdr = false) { - return string.Join("\t", DataDictionary(this, ModstoWritePruned).Values); + return string.Join("\t", DataDictionary(this, ModstoWritePruned, writePeptideLevelFdr).Values); } - public static Dictionary DataDictionary(SpectralMatch psm, IReadOnlyDictionary ModsToWritePruned) + public static Dictionary DataDictionary(SpectralMatch psm, IReadOnlyDictionary ModsToWritePruned, bool writePeptideLevelFdr = false) { Dictionary s = new Dictionary(); PsmTsvWriter.AddBasicMatchData(s, psm); PsmTsvWriter.AddPeptideSequenceData(s, psm, ModsToWritePruned); PsmTsvWriter.AddMatchedIonsData(s, psm?.MatchedFragmentIons); - PsmTsvWriter.AddMatchScoreData(s, psm); + PsmTsvWriter.AddMatchScoreData(s, psm, writePeptideLevelFdr); return s; } diff --git a/MetaMorpheus/TaskLayer/FilteredPsms.cs b/MetaMorpheus/TaskLayer/FilteredPsms.cs new file mode 100644 index 000000000..869a477a5 --- /dev/null +++ b/MetaMorpheus/TaskLayer/FilteredPsms.cs @@ -0,0 +1,155 @@ +using Easy.Common.Extensions; +using EngineLayer; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace TaskLayer +{ + /// + /// Contains a filtered list of PSMs. + /// All properties within this class are read-only, and should only be set on object construction + /// + public class FilteredPsms : IEnumerable + { + public List FilteredPsmsList { get; private set; } + /// + /// Filter type can have only two values: "q-value" or "pep q-value" + /// + public string FilterType { get; init; } + public double FilterThreshold { get; init; } + public bool FilteringNotPerformed { get; init; } + public bool PeptideLevelFiltering { get; init; } + public FilteredPsms(List filteredPsms, string filterType, double filterThreshold, bool filteringNotPerformed, bool peptideLevelFiltering) + { + FilteredPsmsList = filteredPsms; + FilterType = filterType; + FilterThreshold = filterThreshold; + FilteringNotPerformed = filteringNotPerformed; + PeptideLevelFiltering = peptideLevelFiltering; + } + + private bool AboveThreshold(SpectralMatch psm) + { + if (psm.GetFdrInfo(PeptideLevelFiltering) == null) return false; + + switch (FilterType) + { + case "pep q-value": + return psm.GetFdrInfo(PeptideLevelFiltering).PEP_QValue <= FilterThreshold; + default: + return psm.GetFdrInfo(PeptideLevelFiltering).QValue <= FilterThreshold && psm.GetFdrInfo(PeptideLevelFiltering).QValueNotch <= FilterThreshold; + } + } + + /// + /// This method should only be called when filtered PSMs are modified for the purpose of SILAC analysis + /// + /// + public void SetSilacFilteredPsms(List silacPsms) + { + FilteredPsmsList = silacPsms; + } + + /// + /// Returns the number of PSMs that passed the filtering criteria + /// + public int TargetPsmsAboveThreshold => FilteredPsmsList.Count(psm => !psm.IsDecoy && !psm.IsContaminant && AboveThreshold(psm)); + + /// + /// Returns a FilteredPsms object that holds every psm that passed the filtering criteria. + /// Q-Value and PEP Q-Value thresholds are read from common parameters by default, but can be overridden + /// Q-Value and PEP Q-Value filtering are mutually exculsive. + /// In cases where PEP filtering was selected but PEP wasn't performed due to insufficient PSMs, + /// filtering defaults to Q and Q_Notch. + /// + /// List of spectral match objects to be filtered + /// Filter results at the peptide level (defaults to false) + /// A FilteredPsms object + public static FilteredPsms Filter(IEnumerable psms, + CommonParameters commonParams, + bool includeDecoys = true, + bool includeContaminants = true, + bool includeAmbiguous = false, + bool includeAmbiguousMods = true, + bool includeHighQValuePsms = false, + double? qValueThreshold = null, + double? pepQValueThreshold = null, + bool filterAtPeptideLevel = false) + { + + qValueThreshold ??= commonParams.QValueThreshold; + pepQValueThreshold ??= commonParams.PepQValueThreshold; + double filterThreshold = Math.Min((double)qValueThreshold, (double)pepQValueThreshold); + bool filteringNotPerformed = false; + List filteredPsms = new List(); + + // set the filter type + string filterType = "q-value"; + if (pepQValueThreshold < qValueThreshold) + { + if (psms.Count() < 100) + { + filteringNotPerformed = true; + filterThreshold = 1; + } + else + { + filterType = "pep q-value"; + } + } + + if (!includeHighQValuePsms) + { + filteredPsms = filterType.Equals("q-value") + ? psms.Where(p => p.GetFdrInfo(filterAtPeptideLevel) != null + && p.GetFdrInfo(filterAtPeptideLevel).QValue <= filterThreshold + && p.GetFdrInfo(filterAtPeptideLevel).QValueNotch <= filterThreshold).ToList() + : psms.Where(p => p.GetFdrInfo(filterAtPeptideLevel) != null && p.GetFdrInfo(filterAtPeptideLevel).PEP_QValue <= filterThreshold).ToList(); + } + else + { + filteredPsms = psms.ToList(); + } + + if (!includeDecoys) + { + filteredPsms.RemoveAll(p => p.IsDecoy); + } + if (!includeContaminants) + { + filteredPsms.RemoveAll(p => p.IsContaminant); + } + if (!includeAmbiguous) + { + filteredPsms.RemoveAll(p => p.BaseSequence.IsNullOrEmpty()); + } + if (!includeAmbiguousMods) + { + filteredPsms.RemoveAll(p => p.FullSequence.IsNullOrEmpty()); + } + if (filterAtPeptideLevel) + { + //Choose the top scoring PSM for each peptide + filteredPsms = filteredPsms + .OrderByDescending(p => p) + .GroupBy(b => b.FullSequence) + .Select(b => b.FirstOrDefault()).ToList(); + } + + return new FilteredPsms(filteredPsms, filterType, filterThreshold, filteringNotPerformed, filterAtPeptideLevel); + } + + public IEnumerator GetEnumerator() + { + return FilteredPsmsList.GetEnumerator(); + } + + System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() + { + return FilteredPsmsList.GetEnumerator(); + } + } +} diff --git a/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs b/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs index 34e5a591d..a963eefbf 100644 --- a/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs +++ b/MetaMorpheus/TaskLayer/MbrAnalysis/SpectralRecoveryRunner.cs @@ -117,12 +117,7 @@ public static SpectralRecoveryResults RunSpectralRecoveryAlgorithm( if (bestMbrMatches.Any()) { List allPsms = parameters.AllPsms. - OrderByDescending(p => p.Score). - ThenBy(p => p.FdrInfo.QValue). - ThenBy(p => p.FullFilePath). - ThenBy(x => x.ScanNumber). - ThenBy(p => p.FullSequence). - ThenBy(p => p.Accession).ToList(); + OrderByDescending(p => p).ToList(); AssignEstimatedPsmQvalue(bestMbrMatches, allPsms); FDRAnalysisOfMbrPsms(bestMbrMatches, allPsms, parameters, fileSpecificParameters); @@ -141,29 +136,20 @@ private static List GetAllPeptides( CommonParameters commonParameters, List<(string, CommonParameters)> fileSpecificParameters) { - List peptides = new(); - peptides = parameters.AllPsms.Where(b => b.FullSequence != null).GroupBy(b => b.FullSequence).Select(b => b.FirstOrDefault()).ToList(); - - new FdrAnalysisEngine(peptides, parameters.NumNotches, commonParameters, fileSpecificParameters, new List { parameters.SearchTaskId }, "Peptide").Run(); - - if (!parameters.SearchParameters.WriteDecoys) - { - peptides.RemoveAll(b => b.IsDecoy); - } - if (!parameters.SearchParameters.WriteContaminants) - { - peptides.RemoveAll(b => b.IsContaminant); - } - - double qValueCutoff = 0.01; - if (parameters.AllPsms.Count > 100)//PEP is not computed when there are fewer than 100 psms + var peptides = parameters.AllPsms; + PostSearchAnalysisTask postProcessing = new PostSearchAnalysisTask { - peptides.RemoveAll(p => p.FdrInfo.PEP_QValue > qValueCutoff); - } - else - { - peptides.RemoveAll(p => p.FdrInfo.QValue > qValueCutoff); - } + Parameters = parameters, + FileSpecificParameters = fileSpecificParameters, + CommonParameters = commonParameters + }; + + FilteredPsms.Filter(peptides, + commonParameters, + includeDecoys: false, + includeContaminants: false, + includeAmbiguous: false, + includeHighQValuePsms: false); return peptides; } @@ -178,6 +164,7 @@ private static SpectralMatch BestPsmForMbrPeak(IEnumerable peptid foreach (SpectralMatch psm in nonNullPsms) { psm.SetFdrValues(0, 0, 0, 0, 0, 0, 0, 0); + psm.PeptideFdrInfo = psm.PsmFdrInfo; } if (nonNullPsms.Select(p => p.SpectralAngle).Any(g => g != double.NaN)) { @@ -191,7 +178,7 @@ private static SpectralMatch BestPsmForMbrPeak(IEnumerable peptid private static void AssignEstimatedPsmQvalue(ConcurrentDictionary bestMbrMatches, List allPsms) { double[] allScores = allPsms.Select(s => s.Score).OrderByDescending(s => s).ToArray(); - double[] allQValues = allPsms.OrderByDescending(s => s.Score).Select(q => q.FdrInfo.QValue).ToArray(); + double[] allQValues = allPsms.OrderByDescending(s => s.Score).Select(q => q.PsmFdrInfo.QValue).ToArray(); foreach (SpectralRecoveryPSM match in bestMbrMatches.Values) { diff --git a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs index cbc70c44b..cbc145fcf 100644 --- a/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs +++ b/MetaMorpheus/TaskLayer/MetaMorpheusTask.cs @@ -20,6 +20,7 @@ using Omics.SpectrumMatch; using SpectralAveraging; using UsefulProteomicsDatabases; +using Easy.Common.Extensions; namespace TaskLayer { @@ -551,7 +552,7 @@ public MyTaskResults RunTask(string output_folder, List currentProtei { MetaMorpheusEngine.FinishedSingleEngineHandler -= SingleEngineHandlerInTask; var resultsFileName = Path.Combine(output_folder, "results.txt"); - e.Data.Add("folder", output_folder); + e.Data.Add("folder", output_folder); using (StreamWriter file = new StreamWriter(resultsFileName)) { file.WriteLine(GlobalVariables.MetaMorpheusVersion.Equals("1.0.0.0") ? "MetaMorpheus: Not a release version" : "MetaMorpheus: version " + GlobalVariables.MetaMorpheusVersion); @@ -566,7 +567,7 @@ public MyTaskResults RunTask(string output_folder, List currentProtei throw; } - { +{ var proseFilePath = Path.Combine(output_folder, "AutoGeneratedManuscriptProse.txt"); using (StreamWriter file = new StreamWriter(proseFilePath)) { @@ -685,14 +686,14 @@ protected void LoadModifications(string taskId, out List variableM } } - protected static void WritePsmsToTsv(IEnumerable psms, string filePath, IReadOnlyDictionary modstoWritePruned) + protected static void WritePsmsToTsv(IEnumerable psms, string filePath, IReadOnlyDictionary modstoWritePruned, bool writePeptideLevelResults = false) { using (StreamWriter output = new StreamWriter(filePath)) { output.WriteLine(SpectralMatch.GetTabSeparatedHeader()); foreach (var psm in psms) { - output.WriteLine(psm.ToString(modstoWritePruned)); + output.WriteLine(psm.ToString(modstoWritePruned, writePeptideLevelResults)); } } } diff --git a/MetaMorpheus/TaskLayer/MyTaskResults.cs b/MetaMorpheus/TaskLayer/MyTaskResults.cs index 22886931a..d792d91d2 100644 --- a/MetaMorpheus/TaskLayer/MyTaskResults.cs +++ b/MetaMorpheus/TaskLayer/MyTaskResults.cs @@ -3,6 +3,8 @@ using System.Linq; using System.Text; +using Easy.Common.Extensions; + namespace TaskLayer { public class MyTaskResults @@ -10,7 +12,7 @@ public class MyTaskResults public List NewSpectra; // calibration writes new calibrated spectra public List NewDatabases; // gptmd writes new annotated databases public List NewFileSpecificTomls; // calibration writes suggested ppm tolerances - public TimeSpan Time; + public TimeSpan Time; private readonly List resultTexts; @@ -27,11 +29,9 @@ public override string ToString() StringBuilder sb = new StringBuilder(); sb.AppendLine("Time to run task: " + Time); sb.AppendLine(); - sb.AppendLine(); sb.AppendLine("--------------------------------------------------"); if ((NewSpectra != null && NewSpectra.Any()) || (NewDatabases != null && NewDatabases.Any())) { - sb.AppendLine(); sb.AppendLine(); sb.AppendLine("New files:"); if (NewSpectra != null && NewSpectra.Any()) @@ -46,18 +46,14 @@ public override string ToString() sb.AppendLine(string.Join(Environment.NewLine + "\t", NewDatabases.Select(b => b.FilePath)).ToString()); } sb.AppendLine(); - sb.AppendLine(); sb.AppendLine("--------------------------------------------------"); } sb.AppendLine(); - sb.AppendLine(); sb.AppendLine(PsmPeptideProteinSummaryText.ToString()); sb.AppendLine(TaskSummaryText.ToString()); sb.AppendLine(); - sb.AppendLine(); sb.AppendLine("--------------------------------------------------"); sb.AppendLine(); - sb.AppendLine(); sb.AppendLine("Engine Results:"); sb.AppendLine(); foreach (var ok in resultTexts) diff --git a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs index 9cb70cb01..b025a6ef4 100644 --- a/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs +++ b/MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs @@ -29,12 +29,12 @@ public class PostSearchAnalysisTask : MetaMorpheusTask { public PostSearchAnalysisParameters Parameters { get; set; } private List ProteinGroups { get; set; } - private IEnumerable> PsmsGroupedByFile { get; set; } private SpectralRecoveryResults SpectralRecoveryResults { get; set; } - private List _filteredPsms; - private bool _pepFilteringNotPerformed; - private string _filterType; - private double _filterThreshold; + + /// + /// Used for storage of results for writing to Results.tsv. It is explained in the method ConstructResultsDictionary() + /// + private Dictionary<(string,string),string> ResultsDictionary { get; set; } public PostSearchAnalysisTask() : base(MyTask.Search) @@ -64,14 +64,13 @@ public MyTaskResults Run() { Parameters.AllPsms = Parameters.AllPsms.Where(psm => psm != null).ToList(); Parameters.AllPsms.ForEach(psm => psm.ResolveAllAmbiguities()); - Parameters.AllPsms = Parameters.AllPsms.OrderByDescending(b => b.Score) - .ThenBy(b => b.BioPolymerWithSetModsMonoisotopicMass.HasValue ? Math.Abs(b.ScanPrecursorMass - b.BioPolymerWithSetModsMonoisotopicMass.Value) : double.MaxValue) - .GroupBy(b => (b.FullFilePath, b.ScanNumber, b.BioPolymerWithSetModsMonoisotopicMass)).Select(b => b.First()).ToList(); - - CalculatePsmFdr(); + // The GroupBy statement below gets rid of duplicate PSMs that can occur when the same peptides is matched to the same spectrum multiple times, + // just with slightly different precursor masses. + Parameters.AllPsms = Parameters.AllPsms.OrderByDescending(b => b) + .GroupBy(b => (b.FullFilePath, b.ScanNumber, b.BioPolymerWithSetModsMonoisotopicMass)).Select(b => b.First()).ToList(); + CalculatePsmAndPeptideFdr(Parameters.AllPsms); } - - FilterAllPsms(); + ConstructResultsDictionary(); DoMassDifferenceLocalizationAnalysis(); ProteinAnalysis(); QuantificationAnalysis(); @@ -79,10 +78,19 @@ public MyTaskResults Run() ReportProgress(new ProgressEventArgs(100, "Done!", new List { Parameters.SearchTaskId, "Individual Spectra Files" })); HistogramAnalysis(); - WritePsmResults(); + WritePeptideResults(); + if (Parameters.CurrentRawFileList.Count > 1 && Parameters.SearchParameters.WriteIndividualFiles) + { + // create individual files subdirectory + Directory.CreateDirectory(Parameters.IndividualResultsOutputFolder); + WriteIndividualPsmResults(); + WriteIndividualPeptideResults(); + } WriteProteinResults(); + AddResultsTotalsToAllResultsTsv(); WritePrunedDatabase(); + var k = CommonParameters; if (Parameters.SearchParameters.WriteSpectralLibrary) { SpectralLibraryGeneration(); @@ -104,7 +112,6 @@ public MyTaskResults Run() WriteVariantResults(); } - WritePeptideResults(); // modifies the FDR results for PSMs, so do this last CompressIndividualFileResults(); return Parameters.SearchTaskResults; } @@ -115,114 +122,18 @@ protected override MyTaskResults RunSpecific(string OutputFolder, List - /// Sets the private field _filteredPsms by removing all psms with Q and Q_Notch or PEP_QValues greater - /// than a user defined threshold. Q-Value and PEP Q-Value filtering are mutually exculsive. - /// In cases where PEP filtering was selected but PEP wasn't performed due to insufficient PSMs, - /// filtering defaults to Q and Q_Notch. - /// _filteredPsms can be accessed through the GetFilteredPsms method. - /// Also, sets the PsmsGroupedByFile property. This is done here because filtering is performed every time - /// AllPsms is updated (i.e., in the Run method and during ProteinAnalysis w/ Silac labelling.) - /// - private void FilterAllPsms() - { - _filterType = "q-value"; - _filterThreshold = Math.Min(CommonParameters.QValueThreshold, CommonParameters.PepQValueThreshold); - - if (CommonParameters.PepQValueThreshold < CommonParameters.QValueThreshold) - { - if (Parameters.AllPsms.Count < 100) - { - _pepFilteringNotPerformed = true; - } - else - { - _filterType = "pep q-value"; - } - } - - _filteredPsms = _filterType.Equals("q-value") - ? Parameters.AllPsms.Where(p => - p.FdrInfo.QValue <= _filterThreshold - && p.FdrInfo.QValueNotch <= _filterThreshold) - .ToList() - : Parameters.AllPsms.Where(p => - p.FdrInfo.PEP_QValue <= _filterThreshold) - .ToList(); - - // This property is used for calculating file specific results, which requires calculating - // FDR separately for each file. Therefore, no filtering is performed - PsmsGroupedByFile = Parameters.AllPsms.GroupBy(p => p.FullFilePath); - } - - public IEnumerable GetFilteredPsms(bool includeDecoys, bool includeContaminants, - bool includeAmbiguous) - { - return _filteredPsms.Where(p => - (includeDecoys || !p.IsDecoy) - && (includeContaminants || !p.IsContaminant) - && (includeAmbiguous || p.FullSequence != null)); - } - - /// - /// Modifies a list of PSMs, removing all that should not be written to a results file. - /// - /// A list of PSMs to be modified in place - /// The number of target psms scoring below threshold - private void FilterSpecificPsms(List fileSpecificPsmsOrPeptides, out int psmOrPeptideCountForResults) - { - psmOrPeptideCountForResults = _filterType.Equals("q-value") - ? fileSpecificPsmsOrPeptides.Count(p => - !p.IsDecoy - && p.FdrInfo.QValue <= _filterThreshold - && p.FdrInfo.QValueNotch <= _filterThreshold) - : fileSpecificPsmsOrPeptides.Count(p => - !p.IsDecoy - && p.FdrInfo.PEP_QValue <= _filterThreshold); - - if (!Parameters.SearchParameters.WriteHighQValuePsms) - { - if (_filterType.Equals("q-value")) - { - fileSpecificPsmsOrPeptides.RemoveAll(p => - p.FdrInfo.QValue > _filterThreshold | - p.FdrInfo.QValueNotch > _filterThreshold); - } - else - { - fileSpecificPsmsOrPeptides.RemoveAll(p => - p.FdrInfo.PEP_QValue > _filterThreshold); - } - } - if (!Parameters.SearchParameters.WriteDecoys) - { - fileSpecificPsmsOrPeptides.RemoveAll(b => b.IsDecoy); - } - if (!Parameters.SearchParameters.WriteContaminants) - { - fileSpecificPsmsOrPeptides.RemoveAll(b => b.IsContaminant); - } - } - /// /// Calculate estimated false-discovery rate (FDR) for peptide spectral matches (PSMs) /// - private void CalculatePsmFdr() + private void CalculatePsmAndPeptideFdr(List psms, string analysisType = "PSM", bool doPep = true) { // TODO: because FDR is done before parsimony, if a PSM matches to a target and a decoy protein, there may be conflicts between how it's handled in parsimony and the FDR engine here // for example, here it may be treated as a decoy PSM, where as in parsimony it will be determined by the parsimony algorithm which is agnostic of target/decoy assignments // this could cause weird PSM FDR issues Status("Estimating PSM FDR...", Parameters.SearchTaskId); - new FdrAnalysisEngine(Parameters.AllPsms, Parameters.NumNotches, CommonParameters, this.FileSpecificParameters, new List { Parameters.SearchTaskId }, outputFolder: Parameters.OutputFolder).Run(); - - // sort by q-value because of group FDR stuff - // e.g. multiprotease FDR, non/semi-specific protease, etc - Parameters.AllPsms = Parameters.AllPsms - .OrderBy(p => p.FdrInfo.QValue) - .ThenByDescending(p => p.Score) - .ThenBy(p => p.FdrInfo.CumulativeTarget) - .ToList(); + new FdrAnalysisEngine(psms, Parameters.NumNotches, CommonParameters, this.FileSpecificParameters, + new List { Parameters.SearchTaskId }, analysisType: analysisType, doPEP: doPep, outputFolder: Parameters.OutputFolder).Run(); Status("Done estimating PSM FDR!", Parameters.SearchTaskId); } @@ -244,16 +155,20 @@ private void ProteinAnalysis() { psm.ResolveAllAmbiguities(); } - FilterAllPsms(); } - List psmsForProteinParsimony = Parameters.AllPsms; + var psmForParsimony = FilteredPsms.Filter(Parameters.AllPsms, + commonParams: CommonParameters, + includeDecoys: true, + includeContaminants: true, + includeAmbiguous: false, + includeHighQValuePsms: false); // run parsimony - ProteinParsimonyResults proteinAnalysisResults = (ProteinParsimonyResults)(new ProteinParsimonyEngine(psmsForProteinParsimony, Parameters.SearchParameters.ModPeptidesAreDifferent, CommonParameters, this.FileSpecificParameters, new List { Parameters.SearchTaskId }).Run()); + ProteinParsimonyResults proteinAnalysisResults = (ProteinParsimonyResults)(new ProteinParsimonyEngine(psmForParsimony.FilteredPsmsList, Parameters.SearchParameters.ModPeptidesAreDifferent, CommonParameters, this.FileSpecificParameters, new List { Parameters.SearchTaskId }).Run()); // score protein groups and calculate FDR - ProteinScoringAndFdrResults proteinScoringAndFdrResults = (ProteinScoringAndFdrResults)new ProteinScoringAndFdrEngine(proteinAnalysisResults.ProteinGroups, psmsForProteinParsimony, + ProteinScoringAndFdrResults proteinScoringAndFdrResults = (ProteinScoringAndFdrResults)new ProteinScoringAndFdrEngine(proteinAnalysisResults.ProteinGroups, psmForParsimony.FilteredPsmsList, Parameters.SearchParameters.NoOneHitWonders, Parameters.SearchParameters.ModPeptidesAreDifferent, true, CommonParameters, this.FileSpecificParameters, new List { Parameters.SearchTaskId }).Run(); ProteinGroups = proteinScoringAndFdrResults.SortedAndScoredProteinGroups; @@ -357,10 +272,13 @@ private void QuantificationAnalysis() } // get PSMs to pass to FlashLFQ - var unambiguousPsmsBelowOnePercentFdr = GetFilteredPsms( - includeDecoys: false, + var psmsForQuantification = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, + includeDecoys: false, includeContaminants: true, - includeAmbiguous: false); + includeAmbiguous: false, + includeAmbiguousMods: false, + includeHighQValuePsms: false); // pass protein group info for each PSM var psmToProteinGroups = new Dictionary>(); @@ -391,7 +309,7 @@ private void QuantificationAnalysis() { // if protein groups were not constructed, just use accession numbers var accessionToPg = new Dictionary(); - foreach (var psm in unambiguousPsmsBelowOnePercentFdr) + foreach (var psm in psmsForQuantification) { var proteins = psm.BestMatchingBioPolymersWithSetMods.Select(b => b.Peptide.Parent).Distinct(); @@ -427,10 +345,10 @@ private void QuantificationAnalysis() //The number of psms should roughly increase by a factor of N, where N is the number of labels. //It may not increase exactly by a factor of N if the amino acid(s) that gets labeled doesn't exist in the peptide - List silacPsms = new(); //populate with duplicate psms for heavy/light + List silacPsms = new(); //populate with duplicate psms for heavy/light //multiply the psms by the number of labels - foreach (PeptideSpectralMatch psm in unambiguousPsmsBelowOnePercentFdr) + foreach (PeptideSpectralMatch psm in psmsForQuantification) { //get the original proteinGroup to give to the other psm clones List originalProteinGroups = psmToProteinGroups.ContainsKey(psm) ? psmToProteinGroups[psm] : new List(); @@ -536,18 +454,17 @@ private void QuantificationAnalysis() } //update the list for FlashLFQ silacPsms.ForEach(x => x.ResolveAllAmbiguities()); //update the monoisotopic mass - unambiguousPsmsBelowOnePercentFdr = silacPsms; + psmsForQuantification.SetSilacFilteredPsms(silacPsms); } //group psms by file - var psmsGroupedByFile = unambiguousPsmsBelowOnePercentFdr.GroupBy(p => p.FullFilePath); + var psmsGroupedByFile = psmsForQuantification.GroupBy(p => p.FullFilePath); // some PSMs may not have protein groups (if 2 peptides are required to construct a protein group, some PSMs will be left over) // the peptides should still be quantified but not considered for protein quantification var undefinedPg = new FlashLFQ.ProteinGroup("UNDEFINED", "", ""); //sort the unambiguous psms by protease to make MBR compatible with multiple proteases Dictionary> proteaseSortedPsms = new Dictionary>(); - Dictionary proteaseSortedFlashLFQResults = new Dictionary(); foreach (DigestionParams dp in Parameters.ListOfDigestionParams) { @@ -556,7 +473,7 @@ private void QuantificationAnalysis() proteaseSortedPsms.Add(dp.Protease, new List()); } } - foreach (var psm in unambiguousPsmsBelowOnePercentFdr) + foreach (var psm in psmsForQuantification) { if (!psmToProteinGroups.ContainsKey(psm)) { @@ -580,7 +497,7 @@ private void QuantificationAnalysis() } // run FlashLFQ - var FlashLfqEngine = new FlashLfqEngine( + var flashLfqEngine = new FlashLfqEngine( allIdentifications: flashLFQIdentifications, normalize: Parameters.SearchParameters.Normalize, ppmTolerance: Parameters.SearchParameters.QuantifyPpmTol, @@ -592,7 +509,7 @@ private void QuantificationAnalysis() if (flashLFQIdentifications.Any()) { - Parameters.FlashLfqResults = FlashLfqEngine.Run(); + Parameters.FlashLfqResults = flashLfqEngine.Run(); } // get protein intensity back from FlashLFQ @@ -629,15 +546,18 @@ private void HistogramAnalysis() { if (Parameters.SearchParameters.DoHistogramAnalysis) { - var limitedpsms_with_fdr = GetFilteredPsms( + var limitedpsms_with_fdr = FilteredPsms.Filter(Parameters.AllPsms, + commonParams: CommonParameters, includeDecoys: false, includeContaminants: true, - includeAmbiguous: true).ToList(); + includeAmbiguous: false, + includeHighQValuePsms: false); + if (limitedpsms_with_fdr.Any()) { Status("Running histogram analysis...", new List { Parameters.SearchTaskId }); var myTreeStructure = new BinTreeStructure(); - myTreeStructure.GenerateBins(limitedpsms_with_fdr, Parameters.SearchParameters.HistogramBinTolInDaltons); + myTreeStructure.GenerateBins(limitedpsms_with_fdr.FilteredPsmsList, Parameters.SearchParameters.HistogramBinTolInDaltons); var writtenFile = Path.Combine(Parameters.OutputFolder, "MassDifferenceHistogram.tsv"); WriteTree(myTreeStructure, writtenFile); FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId }); @@ -651,7 +571,7 @@ private void HistogramAnalysis() /// /// PSMs to be written /// Full file path, up to and including the filename and extensioh. - protected void WritePsmsToTsv(IEnumerable psms, string filePath) + protected void WritePsmsToTsv(IEnumerable psms, string filePath, bool writePeptideLevelResults = false) { if (Parameters.SearchParameters.DoMultiplexQuantification && Parameters.MultiplexModification != null && @@ -663,118 +583,156 @@ protected void WritePsmsToTsv(IEnumerable psms, string filePath) } else { - WritePsmsToTsv(psms, filePath, Parameters.SearchParameters.ModsToWriteSelection); + WritePsmsToTsv(psms, filePath, Parameters.SearchParameters.ModsToWriteSelection, writePeptideLevelResults); } } - private void WritePsmResults() { Status("Writing PSM results...", Parameters.SearchTaskId); - - var thresholdPsmList = GetFilteredPsms( + var psmsForPsmResults = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, includeDecoys: Parameters.SearchParameters.WriteDecoys, includeContaminants: Parameters.SearchParameters.WriteContaminants, - includeAmbiguous: true).ToList(); - - // If filter output is false, we need to write all psms, not just ones with Q-value < threshold - List filteredPsmListForOutput = Parameters.SearchParameters.WriteHighQValuePsms - ? Parameters.AllPsms.Where(p => - (Parameters.SearchParameters.WriteDecoys || !p.IsDecoy) - && (Parameters.SearchParameters.WriteContaminants || !p.IsContaminant)) - .ToList() - : thresholdPsmList; + includeAmbiguous: true, + includeHighQValuePsms: Parameters.SearchParameters.WriteHighQValuePsms); // write PSMs string writtenFile = Path.Combine(Parameters.OutputFolder, "AllPSMs.psmtsv"); - WritePsmsToTsv(filteredPsmListForOutput, writtenFile); + WritePsmsToTsv(psmsForPsmResults.OrderByDescending(p=>p).ToList(), writtenFile, writePeptideLevelResults: false); FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId }); // write PSMs for percolator // percolator native read format is .tab writtenFile = Path.Combine(Parameters.OutputFolder, "AllPSMs_FormattedForPercolator.tab"); - WritePsmsForPercolator(filteredPsmListForOutput, writtenFile); + WritePsmsForPercolator(psmsForPsmResults.OrderByDescending(p=>p).ToList(), writtenFile); FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId }); - string filterType = _filterType ?? "q-value"; - double filterCutoffForResultsCounts = _filterThreshold; - int psmOrPeptideCountForResults = thresholdPsmList.Count(p => !p.IsDecoy); - // write summary text - if (_pepFilteringNotPerformed) + if (psmsForPsmResults.FilteringNotPerformed) { + Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText( "PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value." + Environment.NewLine); } - Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText( - "All target PSMs with " + filterType + " = " + Math.Round(filterCutoffForResultsCounts, 2) + ": " + - psmOrPeptideCountForResults + Environment.NewLine); + string psmResultsText = "All target PSMs with " + psmsForPsmResults.FilterType + " = " + Math.Round(psmsForPsmResults.FilterThreshold, 2) + ": " + + psmsForPsmResults.TargetPsmsAboveThreshold; + ResultsDictionary[("All", "PSMs")] = psmResultsText; + } + private void WritePeptideResults() + { + Status("Writing peptide results...", Parameters.SearchTaskId); - if (Parameters.SearchParameters.DoParsimony) + var peptidesForPeptideResults = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, + includeDecoys: Parameters.SearchParameters.WriteDecoys, + includeContaminants: Parameters.SearchParameters.WriteContaminants, + includeAmbiguous: true, + includeHighQValuePsms: Parameters.SearchParameters.WriteHighQValuePsms, + filterAtPeptideLevel: true); + + // write PSMs + string writtenFile = Path.Combine(Parameters.OutputFolder, "AllPeptides.psmtsv"); + WritePsmsToTsv(peptidesForPeptideResults.OrderByDescending(p => p).ToList(), writtenFile, writePeptideLevelResults: true); + FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId }); + + // write summary text + if (peptidesForPeptideResults.FilteringNotPerformed) { - Parameters.SearchTaskResults.AddTaskSummaryText( - "All target protein groups with q-value = 0.01 (1% FDR): " + - ProteinGroups.Count(b => b.QValue <= 0.01 && !b.IsDecoy) + - Environment.NewLine); + Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText( + "PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value." + Environment.NewLine); } + string peptideResultsText = "All target peptides with " + peptidesForPeptideResults.FilterType + " = " + Math.Round(peptidesForPeptideResults.FilterThreshold, 2) + ": " + + peptidesForPeptideResults.TargetPsmsAboveThreshold; + ResultsDictionary[("All", "Peptides")] = peptideResultsText; + } - if (Parameters.CurrentRawFileList.Count > 1 && Parameters.SearchParameters.WriteIndividualFiles) + private void WriteIndividualPsmResults() + { + Status("Writing Individual PSM results...", Parameters.SearchTaskId); + + var psmsGroupedByFile = Parameters.AllPsms.GroupBy(p => p.FullFilePath); + foreach (var psmFileGroup in psmsGroupedByFile) { - // create individual files subdirectory - Directory.CreateDirectory(Parameters.IndividualResultsOutputFolder); - foreach (var psmFileGroup in PsmsGroupedByFile) - { - // FDR Analysis is performed again for each file. File specific results show the results that would be - // generated by analyzing one file by itself. Therefore, the FDR info should change between AllResults and FileSpecific - string strippedFileName = Path.GetFileNameWithoutExtension(psmFileGroup.Key); - var psmsForThisFile = psmFileGroup.ToList(); - new FdrAnalysisEngine(psmsForThisFile, Parameters.NumNotches, CommonParameters, FileSpecificParameters, - new List { Parameters.SearchTaskId }, doPEP: false).Run(); - - FilterSpecificPsms(psmsForThisFile, out psmOrPeptideCountForResults); - - // write summary text - Parameters.SearchTaskResults.AddTaskSummaryText("MS2 spectra in " + strippedFileName + ": " + Parameters.NumMs2SpectraPerFile[strippedFileName][0]); - Parameters.SearchTaskResults.AddTaskSummaryText("Precursors fragmented in " + strippedFileName + ": " + Parameters.NumMs2SpectraPerFile[strippedFileName][1]); - Parameters.SearchTaskResults.AddTaskSummaryText(strippedFileName + " target PSMs with " + filterType + " = " + - Math.Round(filterCutoffForResultsCounts, 2) + ": " + psmOrPeptideCountForResults + Environment.NewLine); - - // write PSMs - writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_PSMs.psmtsv"); - WritePsmsToTsv(psmsForThisFile, writtenFile); - FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); - - // write PSMs for percolator - writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_PSMsFormattedForPercolator.tab"); - WritePsmsForPercolator(psmsForThisFile, writtenFile); - FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); - } + // FDR Analysis is performed again for each file. File specific results show the results that would be + // generated by analyzing one file by itself. Therefore, the FDR info should change between AllResults and FileSpecific + string strippedFileName = Path.GetFileNameWithoutExtension(psmFileGroup.Key); + var psmsForThisFile = psmFileGroup.ToList(); + CalculatePsmAndPeptideFdr(psmsForThisFile,"PSM", false); + var psmsToWrite = FilteredPsms.Filter(psmsForThisFile, + CommonParameters, + includeDecoys: Parameters.SearchParameters.WriteDecoys, + includeContaminants: Parameters.SearchParameters.WriteContaminants, + includeAmbiguous: true, + includeHighQValuePsms: Parameters.SearchParameters.WriteHighQValuePsms); + + int count = psmsToWrite.Where(psm => psm.PsmFdrInfo.PEP <= 0.01).Count(); + + // write PSMs + string writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_PSMs.psmtsv"); + WritePsmsToTsv(psmsToWrite, writtenFile); + FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); + + // write PSMs for percolator + writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_PSMsFormattedForPercolator.tab"); + WritePsmsForPercolator(psmsToWrite.FilteredPsmsList, writtenFile); + FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); + + // write summary text + string psmResultsText = strippedFileName + " - All target PSMs with " + psmsToWrite.FilterType + " = " + Math.Round(psmsToWrite.FilterThreshold, 2) + ": " + + psmsToWrite.TargetPsmsAboveThreshold; + ResultsDictionary[(strippedFileName, "PSMs")] = psmResultsText; } - } + private void WriteIndividualPeptideResults() + { + Status("Writing Individual Peptide results...", Parameters.SearchTaskId); + + var peptidesGroupedByFile = Parameters.AllPsms.GroupBy(p => p.FullFilePath); + foreach (var psmFileGroup in peptidesGroupedByFile) + { + var peptideFileGroup = psmFileGroup + .OrderByDescending(p => p) + .GroupBy(p => p.FullSequence) + .Select(group => group.FirstOrDefault()) + .ToList(); + // FDR Analysis is performed again for each file. File specific results show the results that would be + // generated by analyzing one file by itself. Therefore, the FDR info should change between AllResults and FileSpecific + string strippedFileName = Path.GetFileNameWithoutExtension(psmFileGroup.Key); + CalculatePsmAndPeptideFdr(peptideFileGroup, "peptide", false); + var peptidesToWrite = FilteredPsms.Filter(peptideFileGroup, + CommonParameters, + includeDecoys: Parameters.SearchParameters.WriteDecoys, + includeContaminants: Parameters.SearchParameters.WriteContaminants, + includeAmbiguous: true, + includeHighQValuePsms: Parameters.SearchParameters.WriteHighQValuePsms, + filterAtPeptideLevel: true); + + // write PSMs + string writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_Peptides.psmtsv"); + WritePsmsToTsv(peptidesToWrite, writtenFile, writePeptideLevelResults: true); + FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", psmFileGroup.Key }); + + // write summary text + string peptideResultsText = strippedFileName + " - All target peptides with " + peptidesToWrite.FilterType + " = " + Math.Round(peptidesToWrite.FilterThreshold, 2) + ": " + + peptidesToWrite.TargetPsmsAboveThreshold; + ResultsDictionary[(strippedFileName, "Peptides")] = peptideResultsText; + } + + } private void UpdateSpectralLibrary() { - var filteredPsmList = GetFilteredPsms( + var peptidesForSpectralLibrary = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, includeDecoys: false, includeContaminants: false, - includeAmbiguous: false); - - //group psms by peptide and charge, the psms having same sequence and same charge will be in the same group - Dictionary<(String, int), List> PsmsGroupByPeptideAndCharge = new Dictionary<(String, int), List>(); - foreach (var x in filteredPsmList) - { - List psmsWithSamePeptideAndSameCharge = filteredPsmList.Where(b => b.FullSequence == x.FullSequence && b.ScanPrecursorCharge == x.ScanPrecursorCharge).OrderByDescending(p => p.Score).ToList(); - (String, int) peptideWithChargeState = (x.FullSequence, x.ScanPrecursorCharge); + includeAmbiguous: false, + includeHighQValuePsms: false); - if (!PsmsGroupByPeptideAndCharge.ContainsKey(peptideWithChargeState)) - { - PsmsGroupByPeptideAndCharge.Add(peptideWithChargeState, psmsWithSamePeptideAndSameCharge); - } - } //group psms by peptide and charge, then write highest scoring PSM to dictionary - Dictionary<(string, int), SpectralMatch> psmSeqChargeDictionary = filteredPsmList + Dictionary<(string, int), SpectralMatch> psmSeqChargeDictionary = peptidesForSpectralLibrary .GroupBy(p => (p.FullSequence, p.ScanPrecursorCharge)) .ToDictionary( // Key is a (FullSequence, Charge) tuple @@ -824,8 +782,7 @@ private void UpdateSpectralLibrary() string updatedSpectralLibrary = UpdateSpectralLibrary(updatedLibrarySpectra, Parameters.OutputFolder); - Parameters.SearchTaskResults.NewDatabases = new List(); - Parameters.SearchTaskResults.NewDatabases.Add(new DbForTask(updatedSpectralLibrary, false)); + Parameters.SearchTaskResults.NewDatabases = new List { new DbForTask(updatedSpectralLibrary, false) }; DbForTask originalFastaDb = Parameters.DatabaseFilenameList.Where(p => p.IsSpectralLibrary == false && p.IsContaminant == false).First(); Parameters.SearchTaskResults.NewDatabases.Add(originalFastaDb); @@ -834,13 +791,15 @@ private void UpdateSpectralLibrary() //for those spectra matching the same peptide/protein with same charge, save the one with highest score private void SpectralLibraryGeneration() { - var filteredPsmList = GetFilteredPsms( + var peptidesForSpectralLibrary = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, includeDecoys: false, includeContaminants: false, - includeAmbiguous: false); + includeAmbiguous: false, + includeHighQValuePsms: false); //group psms by peptide and charge, the psms having same sequence and same charge will be in the same group - var fullSeqChargeGrouping = filteredPsmList.GroupBy(p => (p.FullSequence, p.ScanPrecursorCharge)); + var fullSeqChargeGrouping = peptidesForSpectralLibrary.GroupBy(p => (p.FullSequence, p.ScanPrecursorCharge)); List spectraLibrary = new(); foreach (var matchGroup in fullSeqChargeGrouping) { @@ -856,140 +815,160 @@ private void SpectralLibraryGeneration() WriteSpectrumLibrary(spectraLibrary, Parameters.OutputFolder); } + private void WriteProteinResults() { - if (Parameters.SearchParameters.DoParsimony) + if (!Parameters.SearchParameters.DoParsimony) { - string fileName = "AllProteinGroups.tsv"; - - if (Parameters.SearchParameters.DoLabelFreeQuantification) - { - fileName = "AllQuantifiedProteinGroups.tsv"; - } + return; + } + else + { + string proteinResultsText = "All target protein groups with q-value = 0.01 (1% FDR): " + ProteinGroups.Count(b => b.QValue <= 0.01 && !b.IsDecoy); + ResultsDictionary[("All", "Proteins")] = proteinResultsText; + } + + string fileName = "AllProteinGroups.tsv"; + if (Parameters.SearchParameters.DoLabelFreeQuantification) + { + fileName = "AllQuantifiedProteinGroups.tsv"; + } - //set peptide output values - ProteinGroups.ForEach(x => x.GetIdentifiedPeptidesOutput(Parameters.SearchParameters.SilacLabels)); - // write protein groups to tsv - string writtenFile = Path.Combine(Parameters.OutputFolder, fileName); - WriteProteinGroupsToTsv(ProteinGroups, writtenFile, new List { Parameters.SearchTaskId } ); + //set peptide output values + ProteinGroups.ForEach(x => x.GetIdentifiedPeptidesOutput(Parameters.SearchParameters.SilacLabels)); + // write protein groups to tsv + string writtenFile = Path.Combine(Parameters.OutputFolder, fileName); + WriteProteinGroupsToTsv(ProteinGroups, writtenFile, new List { Parameters.SearchTaskId }); - // write all individual file results to subdirectory - // local protein fdr, global parsimony, global psm fdr - if (Parameters.CurrentRawFileList.Count > 1 && (Parameters.SearchParameters.WriteIndividualFiles - || Parameters.SearchParameters.WriteMzId || Parameters.SearchParameters.WritePepXml)) - { - Directory.CreateDirectory(Parameters.IndividualResultsOutputFolder); - } + if (Parameters.CurrentRawFileList.Count > 1 && (Parameters.SearchParameters.WriteIndividualFiles + || Parameters.SearchParameters.WriteMzId || + Parameters.SearchParameters.WritePepXml)) + { + Directory.CreateDirectory(Parameters.IndividualResultsOutputFolder); + } + var psmsGroupedByFile = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, + includeDecoys: true, + includeContaminants: true, + includeAmbiguous: true, + includeHighQValuePsms: false).FilteredPsmsList.GroupBy(f => f.FullFilePath); - //If doing a SILAC search and no "unlabeled" labels were specified (i.e. multiple labels are used for multiplexing and no conditions are "unlabeled"), - //then we need to update the psms (which were found in the data file that has the "unlabeled" named) and say they were found in the "heavy" file) - if (Parameters.SearchParameters.SilacLabels != null) //if we have silac labels + //if we're writing individual files, we need to reprocess the psms + //If doing a SILAC search and no "unlabeled" labels were specified (i.e. multiple labels are used for multiplexing and no conditions are "unlabeled"), + //then we need to update the psms (which were found in the data file that has the "unlabeled" named) and say they were found in the "heavy" file) + if (Parameters.SearchParameters.SilacLabels != null) //if we have silac labels + { + //get the original filenames + List fileNamesThatHadPsms = psmsGroupedByFile.Select(v => v.Key).ToList(); + EngineLayer.ProteinGroup firstProteinGroup = ProteinGroups.FirstOrDefault(); //grab the first protein to extract the files used for quantification + if (firstProteinGroup != null) //check that we even have a protein group to write { - //get the original filenames - List fileNamesThatHadPsms = PsmsGroupedByFile.Select(v => v.Key).ToList(); - EngineLayer.ProteinGroup firstProteinGroup = ProteinGroups.FirstOrDefault(); //grab the first protein to extract the files used for quantification - if (firstProteinGroup != null) //check that we even have a protein group to write + var tempPsmsGroupedByFile = new List>(); + //foreach original file + foreach (string originalFile in fileNamesThatHadPsms) { - var tempPsmsGroupedByFile = new List>(); - //foreach original file - foreach (string originalFile in fileNamesThatHadPsms) + //get all the "filenames" output by quantification. If no unlabeled condition was specified, the original datafile will not be present in the current grouping + //Example: the datafile "test.mzml" that was searched with +4 or +10 neutron mass difference on arginine would appear as "test(R+4).mzml" and "test(R+10).mzml". + //there would be no "test.mzml" + List labeledFiles = new List { originalFile }; + foreach (SilacLabel label in Parameters.SearchParameters.SilacLabels) { - //get all the "filenames" output by quantification. If no unlabeled condition was specified, the original datafile will not be present in the current grouping - //Example: the datafile "test.mzml" that was searched with +4 or +10 neutron mass difference on arginine would appear as "test(R+4).mzml" and "test(R+10).mzml". - //there would be no "test.mzml" - List labeledFiles = new List { originalFile }; - foreach (SilacLabel label in Parameters.SearchParameters.SilacLabels) - { - //rediscover the previous naming conversion(s) - labeledFiles.Add(SilacConversions.GetHeavyFileInfo(new SpectraFileInfo(originalFile, "", 0, 0, 0), label).FullFilePathWithExtension); - } - - //rename the file group for all of the relevant psms to their original file - List psms = PsmsGroupedByFile.Where(g => labeledFiles.Contains(g.Key)).SelectMany(x => x).ToList(); //grab all the psms - tempPsmsGroupedByFile.AddRange(psms.GroupBy(x => originalFile)); + //rediscover the previous naming conversion(s) + labeledFiles.Add(SilacConversions.GetHeavyFileInfo(new SpectraFileInfo(originalFile, "", 0, 0, 0), label).FullFilePathWithExtension); } - //overwrite the grouping for downstream processing - PsmsGroupedByFile = tempPsmsGroupedByFile.ToList(); + + //rename the file group for all of the relevant psms to their original file + List psms = psmsGroupedByFile.Where(g => labeledFiles.Contains(g.Key)).SelectMany(x => x).ToList(); //grab all the psms + tempPsmsGroupedByFile.AddRange(psms.GroupBy(x => originalFile)); } + //overwrite the grouping for downstream processing + psmsGroupedByFile = tempPsmsGroupedByFile.ToList(); } + } - //write the individual result files for each datafile - foreach (var fullFilePath in PsmsGroupedByFile.Select(v => v.Key)) - { - string strippedFileName = Path.GetFileNameWithoutExtension(fullFilePath); - - List psmsForThisFile = PsmsGroupedByFile.Where(p => p.Key == fullFilePath).SelectMany(g => g).ToList(); - var subsetProteinGroupsForThisFile = ProteinGroups.Select(p => p.ConstructSubsetProteinGroup(fullFilePath, Parameters.SearchParameters.SilacLabels)).ToList(); + //write the individual result files for each datafile + foreach (var fullFilePath in psmsGroupedByFile.Select(v => v.Key)) + { + string strippedFileName = Path.GetFileNameWithoutExtension(fullFilePath); - ProteinScoringAndFdrResults subsetProteinScoringAndFdrResults = (ProteinScoringAndFdrResults)new ProteinScoringAndFdrEngine(subsetProteinGroupsForThisFile, psmsForThisFile, - Parameters.SearchParameters.NoOneHitWonders, Parameters.SearchParameters.ModPeptidesAreDifferent, - false, CommonParameters, this.FileSpecificParameters, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }).Run(); + List psmsForThisFile = psmsGroupedByFile.Where(p => p.Key == fullFilePath).SelectMany(g => g).ToList(); + var subsetProteinGroupsForThisFile = ProteinGroups.Select(p => p.ConstructSubsetProteinGroup(fullFilePath, Parameters.SearchParameters.SilacLabels)).ToList(); - subsetProteinGroupsForThisFile = subsetProteinScoringAndFdrResults.SortedAndScoredProteinGroups; + ProteinScoringAndFdrResults subsetProteinScoringAndFdrResults = (ProteinScoringAndFdrResults)new ProteinScoringAndFdrEngine(subsetProteinGroupsForThisFile, psmsForThisFile, + Parameters.SearchParameters.NoOneHitWonders, Parameters.SearchParameters.ModPeptidesAreDifferent, + false, CommonParameters, this.FileSpecificParameters, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }).Run(); - Parameters.SearchTaskResults.AddTaskSummaryText("Target protein groups within 1 % FDR in " + strippedFileName + ": " + subsetProteinGroupsForThisFile.Count(b => b.QValue <= 0.01 && !b.IsDecoy)); + subsetProteinGroupsForThisFile = subsetProteinScoringAndFdrResults.SortedAndScoredProteinGroups; - // write individual spectra file protein groups results to tsv - if (Parameters.SearchParameters.WriteIndividualFiles && Parameters.CurrentRawFileList.Count > 1) - { - writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_ProteinGroups.tsv"); - WriteProteinGroupsToTsv(subsetProteinGroupsForThisFile, writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); - } + if (Parameters.SearchParameters.WriteIndividualFiles && Parameters.CurrentRawFileList.Count > 1) + { + writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + "_ProteinGroups.tsv"); + WriteProteinGroupsToTsv(subsetProteinGroupsForThisFile, writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); + } - FilterSpecificPsms(psmsForThisFile, out int count); // Filter psms in place before writing - // write mzID - if (Parameters.SearchParameters.WriteMzId) - { - Status("Writing mzID...", new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); + // write summary text + string proteinResultsText = strippedFileName + " - Target protein groups within 1 % FDR: " + subsetProteinGroupsForThisFile.Count(b => b.QValue <= 0.01 && !b.IsDecoy); + ResultsDictionary[(strippedFileName, "Proteins")] = proteinResultsText; - string mzidFilePath = Path.Combine(Parameters.OutputFolder, strippedFileName + ".mzID"); - if (Parameters.CurrentRawFileList.Count > 1) - { - mzidFilePath = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + ".mzID"); - } + psmsForThisFile = FilteredPsms.Filter(psmsForThisFile, + CommonParameters, + includeDecoys: Parameters.SearchParameters.WriteDecoys, + includeContaminants: Parameters.SearchParameters.WriteContaminants, + includeAmbiguous: true, + includeHighQValuePsms: true).FilteredPsmsList; - - MzIdentMLWriter.WriteMzIdentMl( - psmsForThisFile, - subsetProteinGroupsForThisFile, - Parameters.VariableModifications, - Parameters.FixedModifications, - Parameters.SearchParameters.SilacLabels, - new List { CommonParameters.DigestionParams.Protease }, - CommonParameters.ProductMassTolerance, - CommonParameters.PrecursorMassTolerance, - CommonParameters.DigestionParams.MaxMissedCleavages, - mzidFilePath, - Parameters.SearchParameters.IncludeModMotifInMzid); - - FinishedWritingFile(mzidFilePath, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); - } + // Filter psms in place before writing mzID + if (Parameters.SearchParameters.WriteMzId) + { + Status("Writing mzID...", new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); - // write pepXML - if (Parameters.SearchParameters.WritePepXml) + string mzidFilePath = Path.Combine(Parameters.OutputFolder, strippedFileName + ".mzID"); + if (Parameters.CurrentRawFileList.Count > 1) { - Status("Writing pepXML...", new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); + mzidFilePath = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + ".mzID"); + } - string pepXMLFilePath = Path.Combine(Parameters.OutputFolder, strippedFileName + ".pep.XML"); - if (Parameters.CurrentRawFileList.Count > 1) - { - pepXMLFilePath = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + ".pep.XML"); - } + MzIdentMLWriter.WriteMzIdentMl( + psmsForThisFile, + subsetProteinGroupsForThisFile, + Parameters.VariableModifications, + Parameters.FixedModifications, + Parameters.SearchParameters.SilacLabels, + new List { CommonParameters.DigestionParams.Protease }, + CommonParameters.ProductMassTolerance, + CommonParameters.PrecursorMassTolerance, + CommonParameters.DigestionParams.MaxMissedCleavages, + mzidFilePath, + Parameters.SearchParameters.IncludeModMotifInMzid); + + FinishedWritingFile(mzidFilePath, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); + } - PepXMLWriter.WritePepXml(psmsForThisFile, - Parameters.DatabaseFilenameList, - Parameters.VariableModifications, - Parameters.FixedModifications, - CommonParameters, pepXMLFilePath); + // write pepXML + if (Parameters.SearchParameters.WritePepXml) + { + Status("Writing pepXML...", new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); - FinishedWritingFile(pepXMLFilePath, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); + string pepXMLFilePath = Path.Combine(Parameters.OutputFolder, strippedFileName + ".pep.XML"); + if (Parameters.CurrentRawFileList.Count > 1) + { + pepXMLFilePath = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + ".pep.XML"); } - ReportProgress(new ProgressEventArgs(100, "Done!", new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath })); + PepXMLWriter.WritePepXml(psmsForThisFile, + Parameters.DatabaseFilenameList, + Parameters.VariableModifications, + Parameters.FixedModifications, + CommonParameters, pepXMLFilePath); + + FinishedWritingFile(pepXMLFilePath, new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath }); } + + ReportProgress(new ProgressEventArgs(100, "Done!", new List { Parameters.SearchTaskId, "Individual Spectra Files", fullFilePath })); } } + private void WriteFlashLFQResults() { if (Parameters.SearchParameters.DoLabelFreeQuantification && Parameters.FlashLfqResults != null) @@ -1036,16 +1015,17 @@ private void WritePrunedDatabase() HashSet modificationsToWriteIfInDatabase = new HashSet(); HashSet modificationsToWriteIfObserved = new HashSet(); - var confidentPsms = GetFilteredPsms( - includeDecoys: false, - includeContaminants: true, - includeAmbiguous: true) - .Where(p => p.BaseSequence != null) - .ToList(); + var filteredPsms = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, + includeDecoys: false, + includeContaminants: true, + includeAmbiguous: false, + includeHighQValuePsms: false); + var proteinToConfidentBaseSequences = new Dictionary>(); // associate all confident PSMs with all possible proteins they could be digest products of (before or after parsimony) - foreach (SpectralMatch psm in confidentPsms) + foreach (SpectralMatch psm in filteredPsms) { var myPepsWithSetMods = psm.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide); @@ -1083,19 +1063,22 @@ private void WritePrunedDatabase() } //generates dictionary of proteins with only localized modifications - var modPsms = GetFilteredPsms( - includeDecoys: false, - includeContaminants: true, - includeAmbiguous: false).ToList(); + var originalModPsms = FilteredPsms.Filter(filteredPsms, + CommonParameters, + includeDecoys: false, + includeContaminants: true, + includeAmbiguous: false, + includeAmbiguousMods: false, + includeHighQValuePsms: false); + - var originalModPsms = Parameters.AllPsms.Where(b => b.FdrInfo.QValueNotch <= 0.01 && b.FdrInfo.QValue <= 0.01 && !b.IsDecoy && b.FullSequence != null).ToList(); var proteinToConfidentModifiedSequences = new Dictionary>(); - HashSet modPsmsFullSeq = modPsms.Select(p => p.FullSequence).ToHashSet(); + HashSet modPsmsFullSeq = originalModPsms.Select(p => p.FullSequence).ToHashSet(); HashSet originalModPsmsFullSeq = originalModPsms.Select(p => p.FullSequence).ToHashSet(); modPsmsFullSeq.ExceptWith(originalModPsmsFullSeq); - foreach (SpectralMatch psm in modPsms) + foreach (SpectralMatch psm in originalModPsms) { var myPepsWithSetMods = psm.BestMatchingBioPolymersWithSetMods.Select(p => p.Peptide); @@ -1309,66 +1292,7 @@ private void WritePrunedDatabase() } } - private void WritePeptideResults() - { - Status("Writing peptide results...", Parameters.SearchTaskId); - - // write best (highest-scoring) PSM per peptide - string filename = "All" + GlobalVariables.AnalyteType + "s.psmtsv"; - string writtenFile = Path.Combine(Parameters.OutputFolder, filename); - List peptides = Parameters.AllPsms - .GroupBy(b => b.FullSequence) - .Select(b => b.FirstOrDefault()).ToList(); - - new FdrAnalysisEngine(peptides, Parameters.NumNotches, CommonParameters, - FileSpecificParameters, new List { Parameters.SearchTaskId }, - "Peptide", doPEP: false).Run(); - - FilterSpecificPsms(peptides, out int psmOrPeptideCountForResults); - - WritePsmsToTsv(peptides, writtenFile); - FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId }); - - Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText( - "All target " + GlobalVariables.AnalyteType.ToLower() + "s with " + _filterType + - " = " + Math.Round(_filterThreshold,2) + " : " + psmOrPeptideCountForResults); - if (Parameters.CurrentRawFileList.Count > 1 && Parameters.SearchParameters.WriteIndividualFiles) - { - // create individual files subdirectory - Directory.CreateDirectory(Parameters.IndividualResultsOutputFolder); - foreach (var file in PsmsGroupedByFile) - { - // write summary text - var psmsForThisFile = file.ToList(); - string strippedFileName = Path.GetFileNameWithoutExtension(file.First().FullFilePath); - var peptidesForFile = psmsForThisFile - .GroupBy(b => b.FullSequence) - .Select(b => b.FirstOrDefault()) - .OrderByDescending(b => b.Score) - .ToList(); - - // FDR Analysis is performed again for each file. File specific results show the results that would be - // generated by analyzing one file by itself. Therefore, the FDR info should change between AllResults and FileSpecific - new FdrAnalysisEngine(peptidesForFile, Parameters.NumNotches, CommonParameters, FileSpecificParameters, - new List { Parameters.SearchTaskId }, "Peptide", doPEP: false).Run(); - - FilterSpecificPsms(peptidesForFile, out psmOrPeptideCountForResults); - - Parameters.SearchTaskResults.AddTaskSummaryText( - strippedFileName + " Target " + GlobalVariables.AnalyteType.ToLower() + "s with " - + _filterType + " = " + Math.Round(_filterThreshold, 2) - + " : " + psmOrPeptideCountForResults + Environment.NewLine); - - // write best (highest-scoring) PSM per peptide - filename = "_" + GlobalVariables.AnalyteType + "s.psmtsv"; - writtenFile = Path.Combine(Parameters.IndividualResultsOutputFolder, strippedFileName + filename); - WritePsmsToTsv(peptidesForFile, writtenFile); - FinishedWritingFile(writtenFile, new List { Parameters.SearchTaskId, "Individual Spectra Files", file.First().FullFilePath }); - } - } - } - - private void WritePsmPlusMultiplexIons(IEnumerable psms, string filePath) + private void WritePsmPlusMultiplexIons(IEnumerable psms, string filePath, bool writePeptideLevelResults = false) { PpmTolerance ionTolerance = new PpmTolerance(10); double[] reporterIonMzs = Parameters.MultiplexModification.DiagnosticIons.First().Value @@ -1387,7 +1311,7 @@ private void WritePsmPlusMultiplexIons(IEnumerable psms, string f GetMultiplexIonIntensities(psm.MsDataScan.MassSpectrum, reporterIonMzs, ionTolerance) .Select(d => d.ToString(CultureInfo.CurrentCulture)); - output.Write(psm.ToString(Parameters.SearchParameters.ModsToWriteSelection).Trim()); + output.Write(psm.ToString(Parameters.SearchParameters.ModsToWriteSelection, writePeptideLevelResults).Trim()); output.Write('\t'); output.WriteLine(String.Join('\t', labelIonIntensities)); } @@ -1480,20 +1404,21 @@ private void WriteVariantResults() string filename = "Variant" + GlobalVariables.AnalyteType + "s.psmtsv"; string variantPeptideFile = Path.Combine(Parameters.OutputFolder, filename); - var fdrPsms = GetFilteredPsms( - includeDecoys: Parameters.SearchParameters.WriteDecoys, - includeContaminants: Parameters.SearchParameters.WriteContaminants, - includeAmbiguous: true) - .Where(p => p.BaseSequence != null) - .ToList(); + var fdrPsms = FilteredPsms.Filter(Parameters.AllPsms, + CommonParameters, + includeDecoys: true, + includeContaminants: true, + includeAmbiguous: true, + includeHighQValuePsms: false, + filterAtPeptideLevel: true); - var possibleVariantPsms = fdrPsms.Where(p => + var possibleVariantPsms = fdrPsms.Where(p => p.BestMatchingBioPolymersWithSetMods.Any(pep => pep.Peptide is PeptideWithSetModifications pwsm && pwsm.IsVariantPeptide())) .OrderByDescending(pep => pep.Score) .ToList(); new FdrAnalysisEngine(possibleVariantPsms, Parameters.NumNotches, CommonParameters, FileSpecificParameters, - new List { Parameters.SearchTaskId }, "variant_PSMs").Run(); + new List { Parameters.SearchTaskId }, "variant_PSMs", doPEP: false).Run(); possibleVariantPsms .OrderBy(p => p.FdrInfo.QValue) @@ -1501,7 +1426,6 @@ private void WriteVariantResults() .ThenBy(p => p.FdrInfo.CumulativeTarget) .ToList(); - FilterSpecificPsms(possibleVariantPsms, out int countOfConfidentPsms); WritePsmsToTsv(possibleVariantPsms, variantPsmFile); List variantPeptides = possibleVariantPsms @@ -1512,7 +1436,7 @@ private void WriteVariantResults() List confidentVariantPeps = new List(); new FdrAnalysisEngine(variantPeptides, Parameters.NumNotches, CommonParameters, FileSpecificParameters, - new List { Parameters.SearchTaskId }, "variant_Peptides").Run(); + new List { Parameters.SearchTaskId }, "variant_Peptides", doPEP: false).Run(); WritePsmsToTsv(variantPeptides, variantPeptideFile); @@ -1545,9 +1469,14 @@ private void WriteVariantResults() Dictionary> stopGainVariants = new(); Dictionary> stopLossVariants = new(); - FilterSpecificPsms(confidentVariantPeps, out int countOfConfidentPeptides); // Filter psms in place + var filteredVariants = FilteredPsms.Filter(confidentVariantPeps, + CommonParameters, + includeDecoys: false, + includeContaminants: false, + includeAmbiguous: false, + includeHighQValuePsms: false); - List modifiedVariantPeptides = confidentVariantPeps + List modifiedVariantPeptides = filteredVariants .Where(p => p.ModsIdentified != null && p.ModsIdentified.Count > 0 && p is PeptideSpectralMatch) .Select(p => (PeptideSpectralMatch)p) .ToList(); //modification can be on any AA in variant peptide @@ -1765,25 +1694,25 @@ private void WriteVariantResults() string[] variantResults = new string[25]; variantResults[0] = "Variant Result Summary"; variantResults[2] = "--------------------------------------------------"; - variantResults[4] = "Number of potential variant containing peptides identified at " + _filterThreshold * 100 + "% group FDR: " + countOfConfidentPsms; - variantResults[5] = "Number of unqiuely identified variant peptides at " + _filterThreshold * 100 + "% group FDR: " + countOfConfidentPeptides; + variantResults[4] = "Number of potential variant containing peptides identified at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + fdrPsms.TargetPsmsAboveThreshold; + variantResults[5] = "Number of unqiuely identified variant peptides at " + filteredVariants.FilterThreshold * 100 + "% group FDR: " + filteredVariants.TargetPsmsAboveThreshold; variantResults[6] = "Number of unique variants: " + totalVariantSites; - variantResults[7] = "Number of SNV missense variant containing peptides at " + _filterThreshold * 100 + "% group FDR: " + SNVmissenseCount; + variantResults[7] = "Number of SNV missense variant containing peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + SNVmissenseCount; variantResults[8] = "Number of unique SNV missense variants: " + SNVmissenseSites; - variantResults[9] = "Number of MNV missense variant containing peptides at " + _filterThreshold * 100 + "% group FDR: " + MNVmissenseCount; + variantResults[9] = "Number of MNV missense variant containing peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + MNVmissenseCount; variantResults[10] = "Number of unique MNV missense variants: " + MNVmissenseSites; - variantResults[11] = "Number of frameshift variant containing peptides at " + _filterThreshold * 100 + "% group FDR: " + frameshiftCount; + variantResults[11] = "Number of frameshift variant containing peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + frameshiftCount; variantResults[12] = "Number of unique frameshift variants: " + frameshiftSites; - variantResults[13] = "Number of inframe insertion variant containing peptides at " + _filterThreshold * 100 + "% group FDR: " + insertionCount; + variantResults[13] = "Number of inframe insertion variant containing peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + insertionCount; variantResults[14] = "Number of unique inframe insertion variants: " + insertionSites; - variantResults[15] = "Number of inframe deletion variant containing peptides at " + _filterThreshold * 100 + "% group FDR: " + deletionCount; + variantResults[15] = "Number of inframe deletion variant containing peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + deletionCount; variantResults[16] = "Number of unique inframe deletion variants: " + deletionSites; - variantResults[17] = "Number of stop gain variant containing peptides at " + _filterThreshold * 100 + "% group FDR: " + stopGainCount; + variantResults[17] = "Number of stop gain variant containing peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + stopGainCount; variantResults[18] = "Number of unique stop gain variants: " + stopGainSites; - variantResults[19] = "Number of stop loss variant containing peptides at " + _filterThreshold * 100 + "% group FDR: " + stopLossCount; + variantResults[19] = "Number of stop loss variant containing peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR: " + stopLossCount; variantResults[20] = "Number of unique stop loss variants: " + stopLossSites; - variantResults[21] = "Number of variant peptides at " + _filterThreshold * 100 + "% group FDR with unambiguous localized modifications: " + modifiedVariantPeptides.Count; - variantResults[22] = "Number of variant peptides at " + _filterThreshold * 100 + "% group FDR with unambiguous localized modifications at the variant sites : " + modifiedVariantSitePeptides.Count; + variantResults[21] = "Number of variant peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR with unambiguous localized modifications: " + modifiedVariantPeptides.Count; + variantResults[22] = "Number of variant peptides at " + fdrPsms.FilterThreshold * 100 + "% group FDR with unambiguous localized modifications at the variant sites : " + modifiedVariantSitePeptides.Count; string filePath = Path.Combine(Parameters.OutputFolder, "VariantAnalysisResultSummary.txt"); File.WriteAllLines(filePath, variantResults); @@ -1912,10 +1841,79 @@ private void WriteProteinGroupsToTsv(List proteinGroup } } } - FinishedWritingFile(filePath, nestedIds); } } + /// + /// This is a handy dictionary to keep track of the PSM, peptide and protein count results at the + /// "All" level and at the individual raw file level. + /// The keys are a tuple such as ("All", "PSMs") or ("RawFileName", "Peptides") + // The values are the results as a string + /// + private void ConstructResultsDictionary() + { + ResultsDictionary = new(); + + ResultsDictionary.Add(("All", "PSMs"),""); + ResultsDictionary.Add(("All", "Peptides"), ""); + + if (Parameters.CurrentRawFileList.Count > 1 && Parameters.SearchParameters.WriteIndividualFiles) + { + foreach (var rawFile in Parameters.CurrentRawFileList) + { + string fileNameWithoutExtension = Path.GetFileNameWithoutExtension(rawFile); + ResultsDictionary.Add((fileNameWithoutExtension, "PSMs"), ""); + ResultsDictionary.Add((fileNameWithoutExtension, "Peptides"), ""); + } + } + + if (Parameters.SearchParameters.DoParsimony) + { + ResultsDictionary.Add(("All", "Proteins"), ""); + if (Parameters.CurrentRawFileList.Count > 1 && Parameters.SearchParameters.WriteIndividualFiles) + { + foreach (var rawFile in Parameters.CurrentRawFileList) + { + string fileNameWithoutExtension = Path.GetFileNameWithoutExtension(rawFile); + ResultsDictionary.Add((fileNameWithoutExtension, "Proteins"), ""); + } + } + } + } + + private string AllResultsTotals() + { + StringBuilder sb = new(); + foreach (var key in ResultsDictionary.Keys) + { + if (key.Item1 == "All") + { + sb.AppendLine(ResultsDictionary[key]); + } + } + + var keys = ResultsDictionary.Keys.Where(k=>k.Item1 != "All").OrderBy(k=>k.Item1).ToList(); + if (keys.Any()) + { + sb.AppendLine(); + var item1 = keys.First().Item1; + foreach (var key in keys) + { + if (key.Item1 != item1) + { + sb.AppendLine(); + item1 = key.Item1; + } + sb.AppendLine(ResultsDictionary[key]); + } + } + return sb.ToString(); + } + + private void AddResultsTotalsToAllResultsTsv() + { + Parameters.SearchTaskResults.AddPsmPeptideProteinSummaryText(AllResultsTotals()); + } private void WritePeptideQuantificationResultsToTsv(FlashLfqResults flashLFQResults, string outputFolder, string fileName, List nestedIds) { diff --git a/MetaMorpheus/Test/FdrTest.cs b/MetaMorpheus/Test/FdrTest.cs index 310b011cf..12643d0e8 100644 --- a/MetaMorpheus/Test/FdrTest.cs +++ b/MetaMorpheus/Test/FdrTest.cs @@ -436,6 +436,7 @@ public static void TestPEP_peptideRemoval() psm.AddOrReplace(pwsm, 1, 1, true, new List(), 0); psm.AddOrReplace(pwsm, 1, 2, true, new List(), 0); psm.SetFdrValues(1, 0, 0, 1, 0, 0, 1, 0); + psm.PeptideFdrInfo = new FdrInfo(); List indiciesOfPeptidesToRemove = new List(); List<(int notch, PeptideWithSetModifications pwsm)> bestMatchingPeptidesToRemove = new List<(int notch, PeptideWithSetModifications pwsm)>(); diff --git a/MetaMorpheus/Test/MultiProteaseParsimonyTest.cs b/MetaMorpheus/Test/MultiProteaseParsimonyTest.cs index 2cea5a6f0..10a95401d 100644 --- a/MetaMorpheus/Test/MultiProteaseParsimonyTest.cs +++ b/MetaMorpheus/Test/MultiProteaseParsimonyTest.cs @@ -1030,17 +1030,34 @@ public static void MultiProteaseParsimony_TestingProteaseSpecificFDRCalculations List<(string fileName, CommonParameters fileSpecificParameters)> fsp = new List<(string fileName, CommonParameters fileSpecificParameters)> { ("filename", new CommonParameters()) }; new FdrAnalysisEngine(psms, 0, new CommonParameters(), fsp, new List()).Run(); - psms = psms.OrderByDescending(p => p.Score).ToList(); - + psms = psms.OrderByDescending(p => p).ToList(); + + //q-value is computed as targetCount / (decoyCount + targetCount) for each protease separately + //once a higher q-value is found, it is used for all subsequent PSMs with the same protease even if increasing number of targets would lower the q-value + + // Row t/d score protease targetCount decoyCount q-value + // 0 t 20 tryp 1 0 0 + // 1 t 19 gluC 1 0 0 + // 2 t 18 tryp 2 0 0 + // 3 t 17 gluC 2 0 0 + // 4 d 16 gluC 2 1 0.5 + // 5 t 15 gluC 3 1 0.5 + // 6 t 14 tryp 3 0 0 + // 7 d 13 tryp 3 1 0.333333333 + // 8 d 12 tryp 3 2 0.666666667 + // 9 t 11 tryp 4 2 0.666666667 + + Assert.AreEqual(0.00, Math.Round(psms[0].FdrInfo.QValue, 2)); Assert.AreEqual(0.00, Math.Round(psms[1].FdrInfo.QValue, 2)); Assert.AreEqual(0.00, Math.Round(psms[2].FdrInfo.QValue, 2)); Assert.AreEqual(0.00, Math.Round(psms[3].FdrInfo.QValue, 2)); - Assert.AreEqual(0.5, Math.Round(psms[4].FdrInfo.QValue, 2)); - Assert.AreEqual(0.33, Math.Round(psms[5].FdrInfo.QValue, 2)); + Assert.AreEqual(0.50, Math.Round(psms[4].FdrInfo.QValue, 2)); + Assert.AreEqual(0.50, Math.Round(psms[5].FdrInfo.QValue, 2)); Assert.AreEqual(0.00, Math.Round(psms[6].FdrInfo.QValue, 2)); Assert.AreEqual(0.33, Math.Round(psms[7].FdrInfo.QValue, 2)); Assert.AreEqual(0.67, Math.Round(psms[8].FdrInfo.QValue, 2)); - Assert.AreEqual(0.5, Math.Round(psms[9].FdrInfo.QValue, 2)); + Assert.AreEqual(0.67, Math.Round(psms[9].FdrInfo.QValue, 2)); + } } } \ No newline at end of file diff --git a/MetaMorpheus/Test/MyTaskTest.cs b/MetaMorpheus/Test/MyTaskTest.cs index be7cf4f8d..c62e3c84b 100644 --- a/MetaMorpheus/Test/MyTaskTest.cs +++ b/MetaMorpheus/Test/MyTaskTest.cs @@ -7,10 +7,8 @@ using Proteomics.ProteolyticDigestion; using System; using System.Collections.Generic; -using System.ComponentModel; using System.IO; using System.Linq; -using System.Reflection; using Omics.Modifications; using TaskLayer; using UsefulProteomicsDatabases; @@ -220,7 +218,8 @@ public static void MakeSureFdrDoesntSkip() digestionParams: new DigestionParams(minPeptideLength: 2), scoreCutoff: 1, deconvolutionIntensityRatio: 999, - deconvolutionMassTolerance: new PpmTolerance(50) + deconvolutionMassTolerance: new PpmTolerance(50), + maxThreadsToUsePerFile: 1 ), SearchParameters = new SearchParameters { @@ -248,27 +247,30 @@ public static void MakeSureFdrDoesntSkip() TestDataFile myMsDataFile = new(new List { targetGood }); - var ii = myMsDataFile.GetOneBasedScan(1).MassSpectrum.YArray.ToList(); + var ms1IntensityList = myMsDataFile.GetOneBasedScan(1).MassSpectrum.YArray.ToList(); - ii.Add(1); - ii.Add(1); - ii.Add(1); - ii.Add(1); + ms1IntensityList.Add(1); + ms1IntensityList.Add(1); + ms1IntensityList.Add(1); - var intensities = ii.ToArray(); + var newIntensityArray = ms1IntensityList.ToArray(); - var mm = myMsDataFile.GetOneBasedScan(1).MassSpectrum.XArray.ToList(); + var ms1MzList = myMsDataFile.GetOneBasedScan(1).MassSpectrum.XArray.ToList(); + Assert.AreEqual(6,ms1MzList.Count); - var hah = 104.35352; - mm.Add(hah); - mm.Add(hah + 1); - mm.Add(hah + 2); + List expectedMzList = new List() { 69.70, 70.03, 70.37, 104.04, 104.55, 105.05 }; + CollectionAssert.AreEquivalent(expectedMzList, ms1MzList.Select(m=>Math.Round(m,2)).ToList()); - var mz = mm.ToArray(); + var firstMz = 104.35352; //this mz is close to one of original mz values, but not exactly the same, it should not disrupt deconvolution + ms1MzList.Add(firstMz); + ms1MzList.Add(firstMz + 1); + ms1MzList.Add(firstMz + 2); - Array.Sort(mz, intensities); + var newMzArray = ms1MzList.ToArray(); - myMsDataFile.ReplaceFirstScanArrays(mz, intensities); + Array.Sort(newMzArray, newIntensityArray); + + myMsDataFile.ReplaceFirstMs1ScanArrays(newMzArray, newIntensityArray); Readers.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(myMsDataFile, mzmlName, false); string outputFolder = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestMakeSureFdrDoesntSkip"); @@ -276,6 +278,11 @@ public static void MakeSureFdrDoesntSkip() // RUN! var theStringResult = task.RunTask(outputFolder, new List { new DbForTask(xmlName, false) }, new List { mzmlName }, "taskId1").ToString(); + + + //There is one PSM with close peptide mass (0 ppm difference) and one PSM with large mass difference (>1000 ppm difference) + //Since this is an open search, both PSMs should be reported because they share the exact same MS2 scan + Assert.IsTrue(theStringResult.Contains("All target PSMs with q-value = 0.01: 1")); Directory.Delete(outputFolder, true); File.Delete(xmlName); @@ -425,7 +432,7 @@ public static void TestPeptideCount() { while ((line = file.ReadLine()) != null) { - if (line.Contains("All target peptides with q-value = 0.01 : 4")) + if (line.Contains("All target peptides with q-value = 0.01: 4")) { foundD = true; } @@ -491,7 +498,7 @@ public static void TestFileOutput() missingFiles = expectedFiles.Except(files); extraFiles = files.Except(expectedFiles); - Assert.That(files.SetEquals(expectedFiles)); + CollectionAssert.AreEquivalent(expectedFiles, files); files = new HashSet(Directory.GetFiles(Path.Combine(thisTaskOutputFolder, "Task Settings")).Select(v => Path.GetFileName(v))); expectedFiles = new HashSet { diff --git a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs index d5ee76ea1..fb6fec18c 100644 --- a/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs +++ b/MetaMorpheus/Test/PostSearchAnalysisTaskTests.cs @@ -26,40 +26,29 @@ public static void AllResultsAndResultsTxtTests() string allResultsFile = Path.Combine(outputFolder, "allResults.txt"); string[] allResults = File.ReadAllLines(allResultsFile); - // TaGe_SA_A549_3_snip_2 is searched twice. First with two files being searched simultaneously, then with TaGe_SA_A549_3_snip_2 by itself - // This allows us to compare the file specific results produced by in the two file search to the output - // produced by searching the file by itself. The number of PSMs and Peptides observed should be the same - // for both single-file and multi-file searches. - // The number of protein groups will be different, because protein inference is performed once, using every peptide - // identified across all files. - int TaGe_SA_A549_3_snip_2ExpectedPsms = 214; - int TaGe_SA_A549_3_snip_2ExpectedPeptides = 174; - - - Assert.AreEqual("All target PSMs with q-value = 0.01: 428", allResults[12]); - Assert.AreEqual("All target peptides with q-value = 0.01 : 174", allResults[13]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", allResults[14]); - Assert.AreEqual("TaGe_SA_A549_3_snip target PSMs with q-value = 0.01: 214", allResults[18]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip: 165", allResults[24]); - Assert.AreEqual("TaGe_SA_A549_3_snip Target peptides with q-value = 0.01 : 174", allResults[26]); - - Assert.AreEqual("TaGe_SA_A549_3_snip_2 target PSMs with q-value = 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPsms, allResults[22]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 Target peptides with q-value = 0.01 : " + TaGe_SA_A549_3_snip_2ExpectedPeptides, allResults[28]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip_2: 165", allResults[25]); + // The new PEP calculation method improves things, so all these numbers are increasing as of (7/17/24) + // There is a discrepancy between the number of All target peptides and individual file target peptides, + // presumably due to the way protein inference is performed. + Assert.AreEqual("All target PSMs with q-value = 0.01: 428", allResults[10]); + Assert.AreEqual("All target peptides with q-value = 0.01: 174", allResults[11]); + Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", allResults[12]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with q-value = 0.01: 214", allResults[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with q-value = 0.01: 174", allResults[15]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 165", allResults[16]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with q-value = 0.01: 214", allResults[18]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with q-value = 0.01: 174", allResults[19]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 165", allResults[20]); string resultsFile = Path.Combine(outputFolder, @"postSearchAnalysisTaskTestOutput\results.txt"); string[] results = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with q-value = 0.01: 428", results[7]); - Assert.AreEqual("All target peptides with q-value = 0.01 : 174", results[8]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", results[9]); - Assert.AreEqual("TaGe_SA_A549_3_snip target PSMs with q-value = 0.01: 214", results[13]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip: 165", results[19]); - Assert.AreEqual("TaGe_SA_A549_3_snip Target peptides with q-value = 0.01 : 174", results[21]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip_2: 165", results[20]); - - Assert.AreEqual("TaGe_SA_A549_3_snip_2 target PSMs with q-value = 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPsms, results[17]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 Target peptides with q-value = 0.01 : " + TaGe_SA_A549_3_snip_2ExpectedPeptides, results[23]); - + Assert.AreEqual("All target PSMs with q-value = 0.01: 428", results[5]); + Assert.AreEqual("All target peptides with q-value = 0.01: 174", results[6]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with q-value = 0.01: 214", results[9]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with q-value = 0.01: 174", results[10]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 165", results[11]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with q-value = 0.01: 214", results[13]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with q-value = 0.01: 174", results[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 165", results[15]); Directory.Delete(outputFolder, true); @@ -70,10 +59,18 @@ public static void AllResultsAndResultsTxtTests() outputFolder); engineToml.Run(); + // TaGe_SA_A549_3_snip_2 is searched twice. First with two files being searched simultaneously, then with TaGe_SA_A549_3_snip_2 by itself + // This allows us to compare the file specific results produced by in the two file search to the output + // produced by searching the file by itself. The number of PSMs and Peptides observed should be the same + // for both single-file and multi-file searches. + // The number of protein groups will be different, because protein inference is performed once, using every peptide + // identified across all files. + int TaGe_SA_A549_3_snip_2ExpectedPsms = 214; + int TaGe_SA_A549_3_snip_2ExpectedPeptides = 174; string[] singleFileResults = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with q-value = 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPsms, singleFileResults[7]); - Assert.AreEqual("All target peptides with q-value = 0.01 : " + TaGe_SA_A549_3_snip_2ExpectedPeptides, singleFileResults[8]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", singleFileResults[9]); + Assert.AreEqual("All target PSMs with q-value = 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPsms, singleFileResults[5]); + Assert.AreEqual("All target peptides with q-value = 0.01: " + TaGe_SA_A549_3_snip_2ExpectedPeptides, singleFileResults[6]); + Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", singleFileResults[7]); //Second test that AllResults and Results display correct numbers of peptides and psms with PEP q-value filter on myTomlPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestData\Task2-SearchTaskconfig.toml"); @@ -83,29 +80,29 @@ public static void AllResultsAndResultsTxtTests() allResultsFile = Path.Combine(outputFolder, "allResults.txt"); allResults = File.ReadAllLines(allResultsFile); - Assert.AreEqual("All target PSMs with pep q-value = 0.01: 485", allResults[12]); - Assert.AreEqual("All target peptides with pep q-value = 0.01 : 196", allResults[13]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", allResults[14]); - - Assert.AreEqual("TaGe_SA_A549_3_snip target PSMs with pep q-value = 0.01: 242", allResults[18]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 target PSMs with pep q-value = 0.01: 242", allResults[22]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip: 165", allResults[24]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip_2: 165", allResults[25]); - Assert.AreEqual("TaGe_SA_A549_3_snip Target peptides with pep q-value = 0.01 : 196", allResults[26]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 Target peptides with pep q-value = 0.01 : 195", allResults[28]); + Assert.AreEqual("All target PSMs with pep q-value = 0.01: 420", allResults[10]); + Assert.AreEqual("All target peptides with pep q-value = 0.01: 172", allResults[11]); + Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 155", allResults[12]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with pep q-value = 0.01: 210", allResults[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with pep q-value = 0.01: 172", allResults[15]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", allResults[16]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with pep q-value = 0.01: 210", allResults[18]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with pep q-value = 0.01: 172", allResults[19]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", allResults[20]); + resultsFile = Path.Combine(outputFolder, @"postSearchAnalysisTaskTestOutput\results.txt"); results = File.ReadAllLines(resultsFile); - Assert.AreEqual("All target PSMs with pep q-value = 0.01: 485", results[7]); - Assert.AreEqual("All target peptides with pep q-value = 0.01 : 196", results[8]); - Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 165", results[9]); - Assert.AreEqual("TaGe_SA_A549_3_snip target PSMs with pep q-value = 0.01: 242", results[13]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 target PSMs with pep q-value = 0.01: 242", results[17]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip: 165", results[19]); - Assert.AreEqual("Target protein groups within 1 % FDR in TaGe_SA_A549_3_snip_2: 165", results[20]); - Assert.AreEqual("TaGe_SA_A549_3_snip Target peptides with pep q-value = 0.01 : 196", results[21]); - Assert.AreEqual("TaGe_SA_A549_3_snip_2 Target peptides with pep q-value = 0.01 : 195", results[23]); + Assert.AreEqual("All target PSMs with pep q-value = 0.01: 420", results[5]); + Assert.AreEqual("All target peptides with pep q-value = 0.01: 172", results[6]); + Assert.AreEqual("All target protein groups with q-value = 0.01 (1% FDR): 155", results[7]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target PSMs with pep q-value = 0.01: 210", results[9]); + Assert.AreEqual("TaGe_SA_A549_3_snip - All target peptides with pep q-value = 0.01: 172", results[10]); + Assert.AreEqual("TaGe_SA_A549_3_snip - Target protein groups within 1 % FDR: 155", results[11]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target PSMs with pep q-value = 0.01: 210", results[13]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - All target peptides with pep q-value = 0.01: 172", results[14]); + Assert.AreEqual("TaGe_SA_A549_3_snip_2 - Target protein groups within 1 % FDR: 155", results[15]); Directory.Delete(outputFolder, true); } diff --git a/MetaMorpheus/Test/SearchEngineTests.cs b/MetaMorpheus/Test/SearchEngineTests.cs index 3e439b4ec..f0e444f8b 100644 --- a/MetaMorpheus/Test/SearchEngineTests.cs +++ b/MetaMorpheus/Test/SearchEngineTests.cs @@ -98,7 +98,7 @@ public static void TestSearchEngineResultsPsmFromTsv() Assert.AreEqual("0", psm.Notch); Assert.AreEqual("Homo sapiens", psm.OrganismName); Assert.That(0, Is.EqualTo(psm.PEP).Within(1E-04)); - Assert.AreEqual(0, psm.PEP_QValue); + Assert.That(0.005, Is.EqualTo(psm.PEP_QValue).Within(1E-03)); Assert.AreEqual("full", psm.PeptideDescription); Assert.AreEqual("2125.92875", psm.PeptideMonoMass); Assert.AreEqual(3, psm.PrecursorCharge); @@ -144,7 +144,7 @@ public static void TestClassicSearchXcorrWithToml() List parsedPsms = PsmTsvReader.ReadTsv(psmFile, out var warnings); Assert.AreEqual(385, parsedPsms.Count); //total psm count - Assert.AreEqual(215, parsedPsms.Count(p => p.QValue < 0.01)); //psms with q-value < 0.01 as read from psmtsv + Assert.AreEqual(215, parsedPsms.Count(p => p.QValue < 0.01)); //psms with q-value < 0.01 as read from psmtsv, including decoys Assert.AreEqual(0, warnings.Count); int countFromResultsTxt = Convert.ToInt32(File.ReadAllLines(Path.Combine(outputFolder, @"SearchTOML\results.txt")).ToList().FirstOrDefault(l=>l.Contains("All target")).Split(":")[1].Trim()); diff --git a/MetaMorpheus/Test/SearchTaskTest.cs b/MetaMorpheus/Test/SearchTaskTest.cs index 9a88b00a9..32e03d4e5 100644 --- a/MetaMorpheus/Test/SearchTaskTest.cs +++ b/MetaMorpheus/Test/SearchTaskTest.cs @@ -13,6 +13,7 @@ using Omics.Digestion; using Omics.Modifications; using TaskLayer; +using Org.BouncyCastle.Asn1.X509; namespace Test { @@ -156,8 +157,6 @@ public static void SemiSpecificTest() digestionParams: new DigestionParams(searchModeType: CleavageSpecificity.Semi, fragmentationTerminus: fragTerm)) }; - DbForTask db = new DbForTask(myDatabase, false); - List<(string, MetaMorpheusTask)> taskList = new List<(string, MetaMorpheusTask)> { ("TestSemiSpecific", searchTask) }; var engine = new EverythingRunnerEngine(taskList, new List { myFile }, new List { new DbForTask(myDatabase, false) }, outputFolder); @@ -369,24 +368,27 @@ public static void PeptideFDRTest() { columns = lineline.ToList(); } - - // since each PSM has a duplicate, these counts will be 1,3,5,7, etc. if peptide FDR isn't calculated - // if peptide FDR is calculated, they will be 1,2,3,4, etc. as expected - else if (lineline[columns.IndexOf("Decoy/Contaminant/Target")] == "D") - { - Assert.AreEqual(++cumDecoys, int.Parse(lineline[columns.IndexOf("Cumulative Decoy")])); - finalQValue = double.Parse(lineline[columns.IndexOf("QValue")], CultureInfo.InvariantCulture); - } else { - Assert.AreEqual(++cumTargets, int.Parse(lineline[columns.IndexOf("Cumulative Target")])); - finalQValue = double.Parse(lineline[columns.IndexOf("QValue")], CultureInfo.InvariantCulture); + // since each PSM has a duplicate, these counts will be 1,3,5,7, etc. if peptide FDR isn't calculated + // if peptide FDR is calculated, they will be 1,2,3,4, etc. as expected + if (lineline[columns.IndexOf("Decoy/Contaminant/Target")] == "D") + { + Assert.AreEqual(++cumDecoys, int.Parse(lineline[columns.IndexOf("Cumulative Decoy")])); + } + else + { + Assert.AreEqual(++cumTargets, int.Parse(lineline[columns.IndexOf("Cumulative Target")])); + } + + finalQValue = Math.Max(finalQValue, (double)cumDecoys / (double)cumTargets); } + } // test that the final q-value follows the (target / decoy) formula // intermediate q-values no longer always follow this formula, so I'm not testing them here - Assert.That((double)cumDecoys / (double)cumTargets, Is.EqualTo(finalQValue).Within(.0005)); + Assert.That(0.5, Is.EqualTo(finalQValue).Within(.0005)); Directory.Delete(folderPath, true); } @@ -611,8 +613,8 @@ public static void TestPepFilteringFewerThan100Psms() string resultsFile = Path.Combine(pepTaskFolder, "results.txt"); string[] results = File.ReadAllLines(resultsFile); - Assert.AreEqual("PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value.", results[7]); - Assert.AreEqual("All target PSMs with q-value = 0.02: 84", results[8]); + Assert.AreEqual("PEP could not be calculated due to an insufficient number of PSMs. Results were filtered by q-value.", results[6]); + Assert.AreEqual("All target PSMs with q-value = 1: 84", results[7]); // clean up Directory.Delete(folderPath, true); diff --git a/MetaMorpheus/Test/SilacTest.cs b/MetaMorpheus/Test/SilacTest.cs index e6b4c04ed..a46cbf186 100644 --- a/MetaMorpheus/Test/SilacTest.cs +++ b/MetaMorpheus/Test/SilacTest.cs @@ -459,7 +459,8 @@ public static void TestSilacTurnover() output = File.ReadAllLines(TestContext.CurrentContext.TestDirectory + @"/TestSilac/AllQuantifiedProteinGroups.tsv"); //test sequence coverage and output worked from multiple labels - Assert.IsTrue(output[1].Contains("PEPTK(+8.014)IDEK(+8.014)|PEPEPEPTK(+1.994)")); + // Both labels should be included, but the order doesnt matter + Assert.IsTrue(output[1].Contains("PEPTK(+8.014)IDEK(+8.014)|PEPEPEPTK(+1.994)") | output[1].Contains("PEPEPEPTK(+1.994)|PEPTK(+8.014)IDEK(+8.014)")); Assert.IsTrue(output[1].Contains("PEPEPEPTKidekPEPTKIDEKa\tPEPEPEPTKidekPEPTKIDEKa\tPEPEPEPTKidekPEPTKIDEKa")); //try modern search (testing indexing) diff --git a/MetaMorpheus/Test/SpectralRecoveryTest.cs b/MetaMorpheus/Test/SpectralRecoveryTest.cs index 8380a4d38..da8b65bb0 100644 --- a/MetaMorpheus/Test/SpectralRecoveryTest.cs +++ b/MetaMorpheus/Test/SpectralRecoveryTest.cs @@ -56,14 +56,16 @@ public void SpectralRecoveryTestSetup() string databasePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", @"SpectralRecoveryTest\HumanFastaSlice.fasta"); proteinList = ProteinDbLoader.LoadProteinFasta(databasePath, true, DecoyType.Reverse, false, out List errors) .Where(protein => protein.AppliedSequenceVariations != null).ToList(); + CommonParameters commonParameters = new CommonParameters(); + foreach (PsmFromTsv readPsm in tsvPsms.Where(psm => !psm.FullSequence.Contains('['))) // Modifications break the parser { string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", "SpectralRecoveryTest", readPsm.FileNameWithoutExtension + ".mzML"); - MsDataScan scan = myFileManager.LoadFile(filePath, new CommonParameters()).GetOneBasedScan(readPsm.Ms2ScanNumber); + MsDataScan scan = myFileManager.LoadFile(filePath, commonParameters).GetOneBasedScan(readPsm.Ms2ScanNumber); Ms2ScanWithSpecificMass ms2Scan = new Ms2ScanWithSpecificMass(scan, readPsm.PrecursorMz, readPsm.PrecursorCharge, - filePath, new CommonParameters()); + filePath, commonParameters); Protein protein = proteinList.First(protein => protein.Accession == readPsm.ProteinAccession); //string[] startAndEndResidues = readPsm.StartAndEndResiduesInProtein.Split(" "); @@ -104,8 +106,8 @@ public void SpectralRecoveryTestSetup() WriteMzId = false, MassDiffAcceptorType = MassDiffAcceptorType.ThreeMM, WriteHighQValuePsms = true - }, - CommonParameters = new CommonParameters() + }, + CommonParameters = new CommonParameters(qValueCutoffForPepCalculation: 0.01) }; searchTaskResults = searchTask.RunTask(outputFolder, databaseList, rawSlices, "name"); @@ -136,10 +138,10 @@ public void SpectralRecoveryTestSetup() QuantifyPpmTol = 25 } }, - CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect), + CommonParameters = new CommonParameters(dissociationType: DissociationType.Autodetect, qValueCutoffForPepCalculation: 0.01), FileSpecificParameters = new List<(string FileName, CommonParameters Parameters)> { - (rawSlices[0], new CommonParameters()), - (rawSlices[1], new CommonParameters()) + (rawSlices[0], new CommonParameters(qValueCutoffForPepCalculation: 0.01)), + (rawSlices[1], new CommonParameters(qValueCutoffForPepCalculation: 0.01)) } }; @@ -150,7 +152,6 @@ public void SpectralRecoveryTestSetup() [Test] public static void SpectralRecoveryPostSearchAnalysisTest() { - List warnings; string mbrAnalysisPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"TestSpectralRecoveryOutput\SpectralRecovery\RecoveredSpectra.psmtsv"); string expectedHitsPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "TestData", @"SpectralRecoveryTest\ExpectedMBRHits.psmtsv"); @@ -162,9 +163,11 @@ public static void SpectralRecoveryPostSearchAnalysisTest() List matches02ng = mbrPsms.Where(p => p.FileNameWithoutExtension == "K13_02ng_1min_frac1").ToList(); List expectedMatches = mbrPsms.Select(p => p.BaseSeq).Intersect(expectedMbrPsms.Select(p => p.BaseSeq).ToList()).ToList(); - Assert.That(matches2ng.Count >= 2); - Assert.That(matches02ng.Count >= 8); - Assert.That(expectedMatches.Count >= 3); // FlashLFQ doesn't find all 6 expected peaks, only 3. MbrAnalysis finds these three peaks + // Changing Q-value calculation methods results in more PSMs being discovered, and so fewer spectra are available to be "recovered" + // (as they were identified in the orignal search) + Assert.That(matches2ng.Count >= 3); + Assert.That(matches02ng.Count >= 10); + Assert.That(expectedMatches.Count >= 2); // FlashLFQ doesn't find all 6 expected peaks, only 3. MbrAnalysis finds these three peaks //TODO: Add test for recovering fdrInfo from original. Currently, PsmTsvReader doesn't support the new columns, so it's hard to test } diff --git a/MetaMorpheus/Test/TestDataFile.cs b/MetaMorpheus/Test/TestDataFile.cs index 4757672eb..3289041d2 100644 --- a/MetaMorpheus/Test/TestDataFile.cs +++ b/MetaMorpheus/Test/TestDataFile.cs @@ -435,7 +435,7 @@ public string Name } } - public void ReplaceFirstScanArrays(double[] mz, double[] intensities) + public void ReplaceFirstMs1ScanArrays(double[] mz, double[] intensities) { MzSpectrum massSpectrum = new MzSpectrum(mz, intensities, false); Scans[0] = new MsDataScan(massSpectrum, Scans[0].OneBasedScanNumber, Scans[0].MsnOrder, Scans[0].IsCentroid, Scans[0].Polarity, Scans[0].RetentionTime, Scans[0].ScanWindowRange, Scans[0].ScanFilter, Scans[0].MzAnalyzer, massSpectrum.SumOfAllY, Scans[0].InjectionTime, null, Scans[0].NativeId); diff --git a/MetaMorpheus/Test/TestPsm.cs b/MetaMorpheus/Test/TestPsm.cs index 7a0f63966..77b521eb5 100644 --- a/MetaMorpheus/Test/TestPsm.cs +++ b/MetaMorpheus/Test/TestPsm.cs @@ -436,7 +436,7 @@ public static void TestPsmCount2() List results = File.ReadAllLines(Path.Combine(outputFolder, @"results.txt")).ToList(); - string peptideCountFromResultsString = results.Where(r => r.Contains("All target peptides with q-value = 0.01 : ")).FirstOrDefault(); + string peptideCountFromResultsString = results.Where(r => r.Contains("All target peptides with q-value = 0.01: ")).FirstOrDefault(); double peptideCountFromResults = Convert.ToDouble(peptideCountFromResultsString.Split(':')[1].ToString()); Assert.AreEqual(allPeptidesQvalueBelowCutoff, peptideCountFromResults); Directory.Delete(outputFolder, true);