From 4fe3343bece3733f9b85a31b3f49a104a9a752b0 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Oct 2023 14:11:11 -0500 Subject: [PATCH 01/55] Predict Retention Time function added --- mzLib/FlashLFQ/FlashLfqEngine.cs | 122 ++++++++++++++++++++++++++++--- 1 file changed, 113 insertions(+), 9 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 3ab8dd5a1..1fecba68d 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -469,6 +469,108 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) _results.Peaks[fileInfo].AddRange(chromatographicPeaks.ToList()); } + internal (double predictedRt, double range, double? rtSd, double? rtInterquartileRange)? PredictRetentionTime( + RetentionTimeCalibDataPoint[] rtCalibrationCurve, + ChromatographicPeak donorPeak, + SpectraFileInfo acceptorFile, SpectraFileInfo donorFile, + bool acceptorSampleIsFractionated, bool donorSampleIsFractionated) + { + + var nearbyCalibrationPoints = new List(); + var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); + + nearbyCalibrationPoints.Clear(); + + // only compare +- 1 fraction + if (acceptorSampleIsFractionated && donorSampleIsFractionated) + { + int acceptorFractionNumber = acceptorFile.Fraction; + int donorFractionNumber = donorFile.Fraction; + + if (Math.Abs(acceptorFractionNumber - donorFractionNumber) > 1) + { + return null; + } + } + + Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + + // binary search for this donor peak in the retention time calibration spline + RetentionTimeCalibDataPoint testPoint = new RetentionTimeCalibDataPoint(donorPeak, null); + int index = Array.BinarySearch(rtCalibrationCurve, testPoint); + + if (index < 0) + { + index = ~index; + } + if (index >= rtCalibrationCurve.Length && index >= 1) + { + index = rtCalibrationCurve.Length - 1; + } + + // gather nearby data points + for (int r = index; r < rtCalibrationCurve.Length; r++) + { + double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; + + if (Math.Abs(rtDiff) < 0.5) + { + nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); + } + else + { + break; + } + } + + for (int r = index - 1; r >= 0; r--) + { + double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; + + if (Math.Abs(rtDiff) < 0.5) + { + nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); + } + else + { + break; + } + } + + if (!nearbyCalibrationPoints.Any()) + { + return null; + } + + // calculate difference between acceptor and donor RTs for these RT region + List rtDiffs = nearbyCalibrationPoints + .Select(p => p.AcceptorFilePeak.Apex.IndexedPeak.RetentionTime - p.DonorFilePeak.Apex.IndexedPeak.RetentionTime) + .ToList(); + + // figure out the range of RT differences between the files that are "reasonable", centered around the median difference + double median = rtDiffs.Median(); + + // default range (if only 1 datapoint, or SD is 0, range is very high, etc) + double rtRange = MbrRtWindow; + double? rtStdDev = null; + double? rtInterquartileRange = null; + + if (nearbyCalibrationPoints.Count < 6 && nearbyCalibrationPoints.Count > 1 && rtDiffs.StandardDeviation() > 0) + { + rtStdDev = rtDiffs.StandardDeviation(); + rtRange = (double)rtStdDev * 6.0; // Multiplication inherited from legacy code, unsure of reason for 6 + } + else if (nearbyCalibrationPoints.Count >= 6 && rtDiffs.InterquartileRange() > 0) + { + rtInterquartileRange = rtDiffs.InterquartileRange(); + rtRange = (double)rtInterquartileRange * 4.5; // Multiplication inherited from legacy code, unsure of reason for 4.5 + } + + rtRange = Math.Min(rtRange, MbrRtWindow); + + return (predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, range: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); + } + #region mbr /// /// This method maps identified peaks from other chromatographic runs ("donors") onto /// the defined chromatographic run ("acceptor"). The goal is to reduce the number of missing @@ -569,7 +671,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) var donorFileLogIntensities = idDonorPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); double medianDonorLogIntensity = donorFileLogIntensities.Median(); - + #endregion // generate RT calibration curve RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile); @@ -615,6 +717,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } + Normal foldChangeDistribution = null; if (listOfFoldChangesBetweenTheFiles.Count > 100) { @@ -699,10 +802,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) .Select(p => p.AcceptorFilePeak.Apex.IndexedPeak.RetentionTime - p.DonorFilePeak.Apex.IndexedPeak.RetentionTime) .ToList(); - //double medianIntensityDiff = nearbyCalibrationPoints - // .Select(p => Math.Log(p.AcceptorFilePeak.Intensity, 2) - Math.Log(p.DonorFilePeak.Intensity, 2)) - // .Median(); - // figure out the range of RT differences between the files that are "reasonable", centered around the median difference double median = rtDiffs.Median(); @@ -724,12 +823,17 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) rtRange = Math.Min(rtRange, MbrRtWindow); + // TODO: Add a toggle that set rtRange to be maximum width + var predictionResults = PredictRetentionTime(rtCalibrationCurve, idDonorPeaks[i], idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + if (predictionResults == null) continue; + (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo = ((double, double, double?, double?))predictionResults; + // this is the RT in the acceptor file to look around to find this analyte - double acceptorFileRtHypothesis = donorPeak.Apex.IndexedPeak.RetentionTime + median; - double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtRange / 2.0); - double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtRange / 2.0); + double acceptorFileRtHypothesis = rtInfo.predictedRt; + double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtInfo.range / 2.0); + double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtInfo.range / 2.0); - Normal rtScoringDistribution = new Normal(acceptorFileRtHypothesis, rtRange / 6); + Normal rtScoringDistribution = new Normal(acceptorFileRtHypothesis, rtInfo.range / 6); // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; From 742ac30728d7444b0e21699413a939c3a18d5736 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 23 Oct 2023 14:19:03 -0500 Subject: [PATCH 02/55] breaking up mbr function, cont'd --- mzLib/FlashLFQ/FlashLfqEngine.cs | 319 +++++++++++-------------------- 1 file changed, 115 insertions(+), 204 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 1fecba68d..cbf717be2 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -590,6 +590,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) var acceptorFileIdentifiedPeaks = _results.Peaks[idAcceptorFile]; var apexToAcceptorFilePeak = new Dictionary(); + // Ppm distribution List ppmErrors = new List(); foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null)) { @@ -600,20 +601,14 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) ppmErrors.Add(peak.MassError); } - if (ppmErrors.Count < 3) { return; } - double ppmSpread = ppmErrors.Count > 30 ? ppmErrors.InterquartileRange() / 1.36 : ppmErrors.StandardDeviation(); - Normal ppmDistribution = new Normal(ppmErrors.Median(), ppmSpread); - double filespecificMbrPpmTolerance = Math.Min(Math.Abs(ppmErrors.Median()) + ppmSpread * 4, MbrPpmTolerance); - - // match between runs PPM tolerance - Tolerance mbrTol = new PpmTolerance(filespecificMbrPpmTolerance); + Tolerance mbrTol = new PpmTolerance(filespecificMbrPpmTolerance); // match between runs PPM tolerance // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier _peakIndexingEngine.DeserializeIndex(idAcceptorFile); @@ -621,7 +616,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // these are the analytes already identified in this run. we don't need to try to match them from other runs var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks.Where(p => p.IsotopicEnvelopes.Any()) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); - var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median(); Normal intensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); @@ -672,58 +666,23 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) var donorFileLogIntensities = idDonorPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); double medianDonorLogIntensity = donorFileLogIntensities.Median(); #endregion - // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile); + // Find the difference in peptide intensities between donor and acceptor files // this intensity score creates a conservative bias in MBR List listOfFoldChangesBetweenTheFiles = new List(); - if (_spectraFileInfo.Select(p => p.Condition).Distinct().Count() > 1 && idDonorFile.Condition != idAcceptorFile.Condition) { - var acceptorFileBestMsmsPeaks = new Dictionary(); - - IEnumerable acceptorPeaks = _results - .Peaks[idAcceptorFile] - .Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - - // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) - { - if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > acceptorPeak.Intensity) - { - acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; - } - } - else - { - acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); - } - } - - foreach (var donorPeak in idDonorPeaks) - { - double donorPeakIntensity = donorPeak.Intensity; - if (acceptorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out var acceptorPeak)) - { - double acceptorPeakIntensity = acceptorPeak.Intensity; - - double intensityLogFoldChange = Math.Log(acceptorPeakIntensity, 2) - Math.Log(donorPeakIntensity, 2); - - listOfFoldChangesBetweenTheFiles.Add(intensityLogFoldChange); - } - } + CalculateFoldChangeBetweenFiles(idAcceptorFile, idDonorPeaks, listOfFoldChangesBetweenTheFiles); } + Normal foldChangeDistribution = listOfFoldChangesBetweenTheFiles.Count > 100 + ? new Normal(listOfFoldChangesBetweenTheFiles.Median(), listOfFoldChangesBetweenTheFiles.StandardDeviation()) + : null; + // generate RT calibration curve + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile); - Normal foldChangeDistribution = null; - if (listOfFoldChangesBetweenTheFiles.Count > 100) - { - new Normal(listOfFoldChangesBetweenTheFiles.Median(), listOfFoldChangesBetweenTheFiles.StandardDeviation()); - } - + // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (range, loopState) => @@ -733,98 +692,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) for (int i = range.Item1; i < range.Item2; i++) { - nearbyCalibrationPoints.Clear(); - - // only compare +- 1 fraction - if (acceptorSampleIsFractionated && donorSampleIsFractionated) - { - int acceptorFractionNumber = idAcceptorFile.Fraction; - int donorFractionNumber = idDonorFile.Fraction; - - if (Math.Abs(acceptorFractionNumber - donorFractionNumber) > 1) - { - continue; - } - } - ChromatographicPeak donorPeak = idDonorPeaks[i]; - Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - - // binary search for this donor peak in the retention time calibration spline - RetentionTimeCalibDataPoint testPoint = new RetentionTimeCalibDataPoint(donorPeak, null); - int index = Array.BinarySearch(rtCalibrationCurve, testPoint); - - if (index < 0) - { - index = ~index; - } - if (index >= rtCalibrationCurve.Length && index >= 1) - { - index = rtCalibrationCurve.Length - 1; - } - - // gather nearby data points - for (int r = index; r < rtCalibrationCurve.Length; r++) - { - double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - - if (Math.Abs(rtDiff) < 0.5) - { - nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); - } - else - { - break; - } - } - - for (int r = index - 1; r >= 0; r--) - { - double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - - if (Math.Abs(rtDiff) < 0.5) - { - nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); - } - else - { - break; - } - } - - if (!nearbyCalibrationPoints.Any()) - { - continue; - } - - // calculate difference between acceptor and donor RTs for these RT region - List rtDiffs = nearbyCalibrationPoints - .Select(p => p.AcceptorFilePeak.Apex.IndexedPeak.RetentionTime - p.DonorFilePeak.Apex.IndexedPeak.RetentionTime) - .ToList(); - - // figure out the range of RT differences between the files that are "reasonable", centered around the median difference - double median = rtDiffs.Median(); - - // default range (if only 1 datapoint, or SD is 0, range is very high, etc) - double rtRange = MbrRtWindow; - double? rtStdDev = null; - double? rtInterquartileRange = null; - - if (nearbyCalibrationPoints.Count < 6 && nearbyCalibrationPoints.Count > 1 && rtDiffs.StandardDeviation() > 0) - { - rtStdDev = rtDiffs.StandardDeviation(); - rtRange = (double)rtStdDev * 6.0; // Multiplication inherited from legacy code, unsure of reason for 6 - } - else if (nearbyCalibrationPoints.Count >= 6 && rtDiffs.InterquartileRange() > 0) - { - rtInterquartileRange = rtDiffs.InterquartileRange(); - rtRange = (double)rtInterquartileRange * 4.5; // Multiplication inherited from legacy code, unsure of reason for 4.5 - } - - rtRange = Math.Min(rtRange, MbrRtWindow); - // TODO: Add a toggle that set rtRange to be maximum width - var predictionResults = PredictRetentionTime(rtCalibrationCurve, idDonorPeaks[i], idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + var predictionResults = PredictRetentionTime(rtCalibrationCurve, donorPeak, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (predictionResults == null) continue; (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo = ((double, double, double?, double?))predictionResults; @@ -832,7 +702,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) double acceptorFileRtHypothesis = rtInfo.predictedRt; double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtInfo.range / 2.0); double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtInfo.range / 2.0); - Normal rtScoringDistribution = new Normal(acceptorFileRtHypothesis, rtInfo.range / 6); // get the MS1 scan info for this region so we can look up indexed peaks @@ -843,10 +712,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { Ms1ScanInfo scan = ms1ScanInfos[j]; if (scan.RetentionTime <= lowerRangeRtHypothesis) - { start = scan; - } - if (scan.RetentionTime >= upperRangeRtHypothesis) { end = scan; @@ -862,6 +728,10 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) chargesToMatch.Add(donorPeak.Apex.ChargeState); } + Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + + // TODO: For decoys, need to increase ppm tolerance until something is found or a maximum is reached + // Decoys, just do one charge state foreach (int z in chargesToMatch) { List chargeXic = new List(); @@ -869,17 +739,11 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) { IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(donorIdentification.PeakfindingMass, j, mbrTol, z); - if (peak != null) - { chargeXic.Add(peak); - } } - - if (!chargeXic.Any()) - { + if (!chargeXic.Any()) continue; - } List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z); @@ -887,57 +751,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // remove the clustered isotopic envelopes from the list of seeds after each iteration while (chargeEnvelopes.Any()) { - var acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); - IsotopicEnvelope seedEnv = chargeEnvelopes.First(); - - var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorIdentification.PeakfindingMass, z, idAcceptorFile, mbrTol); - List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorIdentification, z); - acceptorPeak.IsotopicEnvelopes.AddRange(bestChargeEnvelopes); - acceptorPeak.CalculateIntensityForThisFeature(Integrate); - acceptorPeak.SetRtWindow(acceptorFileRtHypothesis, rtStdDev, rtInterquartileRange); - - CutPeak(acceptorPeak, seedEnv.IndexedPeak.RetentionTime); - - var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)); - claimedPeaks.Add(seedEnv.IndexedPeak); // prevents infinite loops - - chargeEnvelopes.RemoveAll(p => claimedPeaks.Contains(p.IndexedPeak)); - - // peak has already been identified by MSMS - skip it - if (apexToAcceptorFilePeak.ContainsKey(seedEnv.IndexedPeak)) - { + ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); + if (acceptorPeak == null) continue; - } - - // score the peak hypothesis - double rtScore = rtScoringDistribution.Density(acceptorPeak.Apex.IndexedPeak.RetentionTime); - double ppmScore = ppmDistribution.Density(acceptorPeak.MassError); - double intensityScore = 0; - - double logIntensity = Math.Log(acceptorPeak.Intensity, 2); - - if (foldChangeDistribution != null) - { - intensityScore = foldChangeDistribution.Density(logIntensity); - } - else - { - if (logIntensity < medianAcceptorLogIntensity) - { - intensityScore = intensityDistribution.Density(logIntensity); - } - else - { - intensityScore = intensityDistribution.Density(intensityDistribution.Mode); - } - } - - rtScore = Math.Log(rtScore + Math.Sqrt(Math.Pow(rtScore, 2) + 1)); - ppmScore = Math.Log(ppmScore + Math.Sqrt(Math.Pow(ppmScore, 2) + 1)); - intensityScore = Math.Log(intensityScore + Math.Sqrt(Math.Pow(intensityScore, 2) + 1)); - - // larger scores are better - acceptorPeak.MbrScore = (rtScore + ppmScore + intensityScore) * (1 - donorIdentification.PosteriorErrorProbability); // save the peak hypothesis // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak @@ -1014,6 +830,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) }); } + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -1046,14 +863,108 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } - //here is where we get the cosine similarity. - _results.Peaks[idAcceptorFile].Add(best); } RunErrorChecking(idAcceptorFile); } + private void CalculateFoldChangeBetweenFiles(SpectraFileInfo idAcceptorFile, List idDonorPeaks, List listOfFoldChangesBetweenTheFiles) + { + var acceptorFileBestMsmsPeaks = new Dictionary(); + + IEnumerable acceptorPeaks = _results + .Peaks[idAcceptorFile] + .Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); + + // get the best (most intense) peak for each peptide in the acceptor file + foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) + { + if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) + { + if (currentBestPeak.Intensity > acceptorPeak.Intensity) + { + acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; + } + } + else + { + acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); + } + } + + foreach (var donorPeak in idDonorPeaks) + { + double donorPeakIntensity = donorPeak.Intensity; + if (acceptorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out var acceptorPeak)) + { + double acceptorPeakIntensity = acceptorPeak.Intensity; + + double intensityLogFoldChange = Math.Log(acceptorPeakIntensity, 2) - Math.Log(donorPeakIntensity, 2); + + listOfFoldChangesBetweenTheFiles.Add(intensityLogFoldChange); + } + } + } + + // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. + // remove the clustered isotopic envelopes from the list of seeds after each iteration + internal ChromatographicPeak FindAcceptorPeak(SpectraFileInfo idAcceptorFile, Dictionary apexToAcceptorFilePeak, Normal ppmDistribution, Tolerance mbrTol, double medianAcceptorLogIntensity, Normal intensityDistribution, Normal foldChangeDistribution, (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo, double acceptorFileRtHypothesis, Normal rtScoringDistribution, Identification donorIdentification, int z, List chargeEnvelopes) + { + var acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); + IsotopicEnvelope seedEnv = chargeEnvelopes.First(); + + var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorIdentification.PeakfindingMass, z, idAcceptorFile, mbrTol); + List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorIdentification, z); + acceptorPeak.IsotopicEnvelopes.AddRange(bestChargeEnvelopes); + acceptorPeak.CalculateIntensityForThisFeature(Integrate); + acceptorPeak.SetRtWindow(acceptorFileRtHypothesis, rtInfo.rtSd, rtInfo.rtInterquartileRange); + + CutPeak(acceptorPeak, seedEnv.IndexedPeak.RetentionTime); + + var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)); + claimedPeaks.Add(seedEnv.IndexedPeak); // prevents infinite loops + + chargeEnvelopes.RemoveAll(p => claimedPeaks.Contains(p.IndexedPeak)); + + // peak has already been identified by MSMS - skip it + if (apexToAcceptorFilePeak.ContainsKey(seedEnv.IndexedPeak)) + { + return null; + } + + // score the peak hypothesis + double rtScore = rtScoringDistribution.Density(acceptorPeak.Apex.IndexedPeak.RetentionTime); + double ppmScore = ppmDistribution.Density(acceptorPeak.MassError); + double intensityScore = 0; + + double logIntensity = Math.Log(acceptorPeak.Intensity, 2); + + if (foldChangeDistribution != null) + { + intensityScore = foldChangeDistribution.Density(logIntensity); + } + else + { + if (logIntensity < medianAcceptorLogIntensity) + { + intensityScore = intensityDistribution.Density(logIntensity); + } + else + { + intensityScore = intensityDistribution.Density(intensityDistribution.Mode); + } + } + + rtScore = Math.Log(rtScore + Math.Sqrt(Math.Pow(rtScore, 2) + 1)); + ppmScore = Math.Log(ppmScore + Math.Sqrt(Math.Pow(ppmScore, 2) + 1)); + intensityScore = Math.Log(intensityScore + Math.Sqrt(Math.Pow(intensityScore, 2) + 1)); + + // larger scores are better + acceptorPeak.MbrScore = (rtScore + ppmScore + intensityScore) * (1 - donorIdentification.PosteriorErrorProbability); + return acceptorPeak; + } + /// /// Used by the match-between-runs algorithm to determine systematic retention time drifts between /// chromatographic runs. From e51a740ce9c04350dbfcb08f80b96b768dc76d6a Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 31 Oct 2023 14:24:02 -0500 Subject: [PATCH 03/55] Decoy search is working --- mzLib/FlashLFQ/FlashLfqEngine.cs | 304 +++++++++++++++++++++-- mzLib/FlashLFQ/IsotopicEnvelope.cs | 9 + mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 249 +++++++++++++++++++ 3 files changed, 545 insertions(+), 17 deletions(-) create mode 100644 mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index cbf717be2..bf496d369 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -1,16 +1,22 @@ using Chemistry; using MassSpectrometry; using MathNet.Numerics.Distributions; +using MathNet.Numerics.LinearAlgebra.Factorization; using MathNet.Numerics.Statistics; using MzLibUtil; using Proteomics.AminoAcidPolymer; using System; +using System.Collections; using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Linq; +using System.Runtime; using System.Threading.Tasks; using UsefulProteomicsDatabases; +using System.Runtime.CompilerServices; + +[assembly: InternalsVisibleTo("TestFlashLFQ")] namespace FlashLFQ { @@ -62,8 +68,8 @@ public class FlashLfqEngine private Dictionary> _modifiedSequenceToIsotopicDistribution; private List _chargeStates; private FlashLfqResults _results; - private Dictionary _ms1Scans; - private PeakIndexingEngine _peakIndexingEngine; + internal Dictionary _ms1Scans; + internal PeakIndexingEngine _peakIndexingEngine; public FlashLfqEngine( List allIdentifications, @@ -265,7 +271,7 @@ public PeakIndexingEngine GetIndexingEngine() /// If the sequence is modified and the modification has an unknown chemical formula, /// averagine is used for the modified part /// - private void CalculateTheoreticalIsotopeDistributions() + internal void CalculateTheoreticalIsotopeDistributions() { _modifiedSequenceToIsotopicDistribution = new Dictionary>(); @@ -570,7 +576,7 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) return (predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, range: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); } - #region mbr + /// /// This method maps identified peaks from other chromatographic runs ("donors") onto /// the defined chromatographic run ("acceptor"). The goal is to reduce the number of missing @@ -614,9 +620,13 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) _peakIndexingEngine.DeserializeIndex(idAcceptorFile); // these are the analytes already identified in this run. we don't need to try to match them from other runs - var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks.Where(p => p.IsotopicEnvelopes.Any()) + var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks + .Where(p => p.IsotopicEnvelopes.Any()) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); - var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); + var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks + .Where(p => p.Intensity > 0) + .Select(p => Math.Log(p.Intensity, 2)) + .ToList(); double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median(); Normal intensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); @@ -665,7 +675,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) var donorFileLogIntensities = idDonorPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); double medianDonorLogIntensity = donorFileLogIntensities.Median(); - #endregion // Find the difference in peptide intensities between donor and acceptor files // this intensity score creates a conservative bias in MBR @@ -867,6 +876,121 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } RunErrorChecking(idAcceptorFile); + + //MbrDecoySearch(); + } + + internal void MbrDecoySearch(SpectraFileInfo acceptorFile) + { + var decoyPeptides = new List(); + ChromatographicPeak donorPeak = null; + PpmTolerance mbrTol = new PpmTolerance(10); + + // Should do alignment with like 100 anchor peptides, build a spline, then use that to inform the + // rt hypothesis + + // get the MS1 scan info for this region so we can look up indexed peaks + Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[acceptorFile]; + Ms1ScanInfo start = ms1ScanInfos[0]; + Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + for (int j = 0; j < ms1ScanInfos.Length; j++) + { + Ms1ScanInfo scan = ms1ScanInfos[j]; + //if (scan.RetentionTime <= lowerRangeRtHypothesis) + start = scan; + //if (scan.RetentionTime >= upperRangeRtHypothesis) + //{ + // end = scan; + // break; + //} + } + + // now we've identified the region in the chromatography this analyte should appear. + // we need to check for peaks in the region using ppm tolerance and isotope pattern matching + var chargesToMatch = donorPeak.Identifications.Select(p => p.PrecursorChargeState).Distinct().ToList(); + if (!chargesToMatch.Contains(donorPeak.Apex.ChargeState)) + { + chargesToMatch.Add(donorPeak.Apex.ChargeState); + } + + Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + + // TODO: For decoys, need to increase ppm tolerance until something is found or a maximum is reached + // Decoys, just do one charge state + foreach (int z in chargesToMatch) + { + List chargeXic = new List(); + + double adjustment = mbrTol.GetRange(donorIdentification.PeakfindingMass).Width; + + //peakfinding loop + // for every loop after first, adjust target such that it sits in the middle of the tolerance range + // below or above the previous target range. + // e.g., 0, -1, 1, -2, 2 + + + for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) + { + IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(donorIdentification.PeakfindingMass, j, mbrTol, z); + if (peak != null) + chargeXic.Add(peak); + } + if (!chargeXic.Any()) + continue; // goto peakfinidng loop + + List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z); + + // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. + // remove the clustered isotopic envelopes from the list of seeds after each iteration + // while (chargeEnvelopes.Any()) + // { + // ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); + // if (acceptorPeak == null) + // continue; + + // // save the peak hypothesis + // // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak + // if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) + // { + // if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) + // { + // var samePeakSameSequence = existing + // .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); + + // if (samePeakSameSequence != null) + // { + // samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; + // samePeakSameSequence.Identifications.Add(donorIdentification); + // } + // else + // { + // existing.Add(acceptorPeak); + // } + // } + // else + // { + // mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); + // } + // } + // else + // { + // matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); + // matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); + // } + // } + } + + + //run once for each acceptor file + // for each decoy peptide + // get rt hypothesis + range + // get all MS1 scans in range (i.e., find indices + // pick a charge state where decoy would have reasonable mass (350 < m/z < 1600) + // peak pick + // loop where tolerance gets progressively larger until something is found + // find isotopic envelopes with reduced requirement for pearson correlation + // do need some maximum tolerance value. 10k ppm? + // report + store these peaks } private void CalculateFoldChangeBetweenFiles(SpectraFileInfo idAcceptorFile, List idDonorPeaks, List listOfFoldChangesBetweenTheFiles) @@ -909,7 +1033,20 @@ private void CalculateFoldChangeBetweenFiles(SpectraFileInfo idAcceptorFile, Lis // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. // remove the clustered isotopic envelopes from the list of seeds after each iteration - internal ChromatographicPeak FindAcceptorPeak(SpectraFileInfo idAcceptorFile, Dictionary apexToAcceptorFilePeak, Normal ppmDistribution, Tolerance mbrTol, double medianAcceptorLogIntensity, Normal intensityDistribution, Normal foldChangeDistribution, (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo, double acceptorFileRtHypothesis, Normal rtScoringDistribution, Identification donorIdentification, int z, List chargeEnvelopes) + internal ChromatographicPeak FindAcceptorPeak( + SpectraFileInfo idAcceptorFile, + Dictionary apexToAcceptorFilePeak, + Normal ppmDistribution, + Tolerance mbrTol, + double medianAcceptorLogIntensity, + Normal intensityDistribution, + Normal foldChangeDistribution, + (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo, + double acceptorFileRtHypothesis, + Normal rtScoringDistribution, + Identification donorIdentification, + int z, + List chargeEnvelopes) { var acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -965,6 +1102,122 @@ internal ChromatographicPeak FindAcceptorPeak(SpectraFileInfo idAcceptorFile, Di return acceptorPeak; } + internal ChromatographicPeak FindDecoyPeak( + SpectraFileInfo idAcceptorFile, + Dictionary apexToAcceptorFilePeak, + Tolerance mbrTol, + (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo, + Identification donorIdentification) + { + // this is the RT in the acceptor file to look around to find this analyte + double acceptorFileRtHypothesis = rtInfo.predictedRt; + double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtInfo.range / 2.0); + double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtInfo.range / 2.0); + + // get the MS1 scan info for this region so we can look up indexed peaks + Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; + Ms1ScanInfo start = ms1ScanInfos[0]; + Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + for (int j = 0; j < ms1ScanInfos.Length; j++) + { + Ms1ScanInfo scan = ms1ScanInfos[j]; + if (scan.RetentionTime <= lowerRangeRtHypothesis) + start = scan; + if (scan.RetentionTime >= upperRangeRtHypothesis) + { + end = scan; + break; + } + } + + // Checking charge states 2 - 4 + List chargeStateAcceptors = new(); + for(int z = 2; z <= 4; z++) + { + int searchCount = 0; + List acceptorPeakCandidates = new(); + while (acceptorPeakCandidates.Count < 1) + { + if (searchCount > 1000) break; + + // Select the peakFindingMass that will be used for this round of search + int searchRangeCoeff = (searchCount + 2 - 1) / 2; // Integer division that returns ceiling: https://stackoverflow.com/questions/17944/how-to-round-up-the-result-of-integer-division + searchRangeCoeff = searchCount % 2 == 0 ? searchRangeCoeff : -1 * searchRangeCoeff; + double peakFindingMass = donorIdentification.PeakfindingMass + + mbrTol.GetRange(donorIdentification.PeakfindingMass.ToMz(z)).Width * searchRangeCoeff; + + // Pull every imsPeak in the given time range for the peak finding mass + List fullRangeXic = new List(); + for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) + { + IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(peakFindingMass, j, mbrTol, z); + if (peak != null) + fullRangeXic.Add(peak); + } + if (!fullRangeXic.Any()) + { + searchCount++; + continue; + } + + // Find peaks in the XIC + List seedEnvelopes = GetIsotopicEnvelopes(fullRangeXic, donorIdentification, z); + if (!seedEnvelopes.Any()) + { + searchCount++; + continue; + } + while (seedEnvelopes.Any()) + { + IsotopicEnvelope seedEnvelope = seedEnvelopes.First(); + // peak has already been identified by MSMS - skip it + if (apexToAcceptorFilePeak.ContainsKey(seedEnvelope.IndexedPeak)) + { + seedEnvelopes.Remove(seedEnvelope); + continue; + } + + var xic = Peakfind(seedEnvelope.IndexedPeak.RetentionTime, peakFindingMass, z, idAcceptorFile, mbrTol); + List localEnvelopes = GetIsotopicEnvelopes(xic, donorIdentification, z); + if (localEnvelopes.Count() < 3) + { + seedEnvelopes.Remove(seedEnvelope); + continue; + } + ChromatographicPeak acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); + acceptorPeak.IsotopicEnvelopes.AddRange(localEnvelopes); + acceptorPeak.CalculateIntensityForThisFeature(Integrate); + //acceptorPeak.SetRtWindow(acceptorFileRtHypothesis, rtInfo.rtSd, rtInfo.rtInterquartileRange); + + CutPeak(acceptorPeak, seedEnvelope.IndexedPeak.RetentionTime); + acceptorPeakCandidates.Add(acceptorPeak); + + var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)); + if (claimedPeaks.Max(p => p.ZeroBasedMs1ScanIndex) < seedEnvelopes.Min(e => e.IndexedPeak.ZeroBasedMs1ScanIndex)) + break; + seedEnvelopes.Remove(seedEnvelope); + seedEnvelopes.RemoveAll(p => claimedPeaks.Contains(p.IndexedPeak)); + } + searchCount++; + } + + if (!acceptorPeakCandidates.Any()) + continue; + + // Best peak is selected with a shitty heuristic, summing the isotopic correlation of all envelopes + ChromatographicPeak bestPeak = acceptorPeakCandidates.MinBy(peak => + Math.Abs(peak.Identifications.First().PeakfindingMass - peak.Apex.IndexedPeak.Mz.ToMass(peak.Apex.ChargeState))); + chargeStateAcceptors.Add(bestPeak); + } + + if (!chargeStateAcceptors.Any()) return null; + + ChromatographicPeak bestOverallPeak = chargeStateAcceptors.MinBy(peak => + Math.Abs(peak.Identifications.First().PeakfindingMass - peak.Apex.IndexedPeak.Mz.ToMass(peak.Apex.ChargeState))); + bestOverallPeak.MbrScore = -1; + return bestOverallPeak; + } + /// /// Used by the match-between-runs algorithm to determine systematic retention time drifts between /// chromatographic runs. @@ -1104,7 +1357,11 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) /// /// List of imsPeaks, where the mass of each peak is the peak finding mass (most abundant isotope) /// A list of IsotopicEnvelopes, where each envelope contains the sum of the isotopic peak intensities from one scan - public List GetIsotopicEnvelopes(List xic, Identification identification, int chargeState) + public List GetIsotopicEnvelopes( + List xic, + Identification identification, + int chargeState, + bool decoySearch = false) { var isotopicEnvelopes = new List(); var isotopeMassShifts = _modifiedSequenceToIsotopicDistribution[identification.ModifiedSequence]; @@ -1193,9 +1450,22 @@ public List GetIsotopicEnvelopes(List continue; } + if (decoySearch) + { + isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum())); + } // Check that the experimental envelope matches the theoretical - if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance)) + if (decoySearch | CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance, out var corr)) { + if(decoySearch) + { + if(corr > 0.3) + { + isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum(), corr)); + } + + continue; + } // impute unobserved isotope peak intensities // TODO: Figure out why value imputation is performed. Build a toggle? for (int i = 0; i < experimentalIsotopeIntensities.Length; i++) @@ -1206,8 +1476,7 @@ public List GetIsotopicEnvelopes(List } } - isotopicEnvelopes.Add(new IsotopicEnvelope( - peak, chargeState, experimentalIsotopeIntensities.Sum())); + isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum(), corr)); } } @@ -1228,11 +1497,12 @@ public bool CheckIsotopicEnvelopeCorrelation( Dictionary> massShiftToIsotopePeaks, IndexedMassSpectralPeak peak, int chargeState, - Tolerance isotopeTolerance) + Tolerance isotopeTolerance, + out double pearsonCorrelation) { - double corr = Correlation.Pearson( - massShiftToIsotopePeaks[0].Select(p => p.expIntensity), - massShiftToIsotopePeaks[0].Select(p => p.theorIntensity)); + pearsonCorrelation = Correlation.Pearson( + massShiftToIsotopePeaks[0].Select(p => p.expIntensity), + massShiftToIsotopePeaks[0].Select(p => p.theorIntensity)); // check correlation of experimental isotope intensities to the theoretical abundances // check for unexpected peaks @@ -1278,7 +1548,7 @@ public bool CheckIsotopicEnvelopeCorrelation( // If these conditions are true, the isotopic envelope matches the expected envelope better than // either alternative (i.e., +/- missed mono-isotopic) - return corr > 0.7 && corrShiftedLeft - corrWithPadding < 0.1 && corrShiftedRight - corrWithPadding < 0.1; + return pearsonCorrelation > 0.7 && corrShiftedLeft - corrWithPadding < 0.1 && corrShiftedRight - corrWithPadding < 0.1; } /// diff --git a/mzLib/FlashLFQ/IsotopicEnvelope.cs b/mzLib/FlashLFQ/IsotopicEnvelope.cs index 09d7207d7..0875f340b 100644 --- a/mzLib/FlashLFQ/IsotopicEnvelope.cs +++ b/mzLib/FlashLFQ/IsotopicEnvelope.cs @@ -10,6 +10,7 @@ public class IsotopicEnvelope /// public readonly IndexedMassSpectralPeak IndexedPeak; public readonly int ChargeState; + public readonly double PearsonCorrelation; public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity) { @@ -18,6 +19,14 @@ public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeStat Intensity = intensity / chargeState; } + public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity, double pearsonCorrelation) + { + IndexedPeak = monoisotopicPeak; + ChargeState = chargeState; + Intensity = intensity / chargeState; + PearsonCorrelation = pearsonCorrelation; + } + /// /// The summed intensity of all isotope peaks detected in one MS1 scan. This sum may contain /// imputed intensity values for expected isotopes that weren't observed, but only if the observed diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs new file mode 100644 index 000000000..b8ce7f152 --- /dev/null +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -0,0 +1,249 @@ +using Chemistry; +using FlashLFQ; +using MassSpectrometry; +using MathNet.Numerics.Distributions; +using MathNet.Numerics.Statistics; +using MzLibUtil; +using NUnit.Framework; +using Proteomics.AminoAcidPolymer; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Easy.Common.Extensions; +using Test.FileReadingTests; +using UsefulProteomicsDatabases; +using ChromatographicPeak = FlashLFQ.ChromatographicPeak; +using Stopwatch = System.Diagnostics.Stopwatch; +using Peptide = Proteomics.AminoAcidPolymer.Peptide; + + +namespace Test +{ + [TestFixture] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + internal class MbrTargetDecoyTest + { + [Test] + [TestCase(0, ExpectedResult = 0)] + [TestCase(1, ExpectedResult = -1)] + [TestCase(2, ExpectedResult = 1)] + [TestCase(3, ExpectedResult = -2)] + [TestCase(5, ExpectedResult = -3)] + [TestCase(6, ExpectedResult = 3)] + public static int TestDecoySearchFlipFlop(int searchCount) + { + // Integer division take ceiling: https://stackoverflow.com/questions/17944/how-to-round-up-the-result-of-integer-division + int result = (searchCount + 2 - 1) / 2; + result = searchCount % 2 == 0 ? result : -1 * result; + + return result; + } + + [Test] + public static void DecoyPeakFindTrial() + { + string decoyPeptidePath = @"C:\Users\Alex\Source\Repos\chronologer\chronologer-main(noChange)\Arabidopsis_Tryptic_Peptides_Slice.tsv"; + string spectraFilePath = @"D:\SingleCellDataSets\Organoid\Calibration_MM_320\Task1-CalibrateTask\HFL1SC_Unhealthy_CH2_J5-calib.mzML"; + ProteinGroup pg = new ProteinGroup("xyz", "x", "z"); + List pgs = new List { pg }; + SpectraFileInfo j5 = new SpectraFileInfo(spectraFilePath, "A", 1, 1, 1); + double rtRange = 4.0; + + List decoys = new(); + Loaders.LoadElements(); + + using (StreamReader reader = new StreamReader(decoyPeptidePath)) + { + reader.ReadLine(); + while(!reader.EndOfStream) + { + string[] lineSplit = reader.ReadLine().Split('\t'); + if (double.Parse(lineSplit[4]) > 57.5) continue; + + Peptide peptide = new Peptide(sequence: lineSplit[0]); + Identification decoyId = new Identification(j5, peptide.BaseSequence, peptide.BaseSequence, + peptide.MonoisotopicMass, ms2RetentionTimeInMinutes: double.Parse(lineSplit[4]), + chargeState: peptide.MonoisotopicMass > 1600 ? 3 : 2, pgs); + decoys.Add(decoyId); + } + } + + + FlashLfqEngine engine = new FlashLfqEngine( + decoys + ); + + engine.CalculateTheoreticalIsotopeDistributions(); + engine._ms1Scans = new Dictionary(); + engine._peakIndexingEngine.IndexMassSpectralPeaks(j5, true, engine._ms1Scans); + + + List decoyPeaks = new(); + List massDifferences = new(); + Dictionary apexPeakDict = new(); + int decoysConsidered = 0; + Random rnd = new Random(); + foreach (Identification decoy in decoys) + { + + int rndInt = rnd.Next(1, 13); + // Eliminate ~ half of the decoys with mass greater than 2000 daltons + // This is an ad-hoc way of matching target and decoy mass distribution + if (decoy.PeakfindingMass > 1800 && rndInt % 2 == 0) continue; + else if (decoy.PeakfindingMass > 1400 && rndInt % 3 == 0) continue; + + if (decoy.Ms2RetentionTimeInMinutes < 8 && rndInt < 11) continue; + + PpmTolerance tolerance = new PpmTolerance(10); + var foundPeak = engine.FindDecoyPeak( + j5, + apexPeakDict, + tolerance, + (decoy.Ms2RetentionTimeInMinutes, rtRange, null, null), + decoy); + + if (foundPeak != null) + { + decoyPeaks.Add(foundPeak); + massDifferences.Add( + Math.Abs( + decoy.PeakfindingMass - foundPeak.Apex.IndexedPeak.Mz.ToMass(foundPeak.Apex.ChargeState) + )); + } + + decoysConsidered++; + if (decoyPeaks.Count >= 750) break; + } + + int placeholder = 0; + + double massDiffMean = massDifferences.Select(m => Math.Abs(m)).Average(); + double envelopeCountMean = decoyPeaks.Select(peak => peak.IsotopicEnvelopes.Count).Average(); + double intensityMean = decoyPeaks.Select(peak => peak.IsotopicEnvelopes.Select(e => e.Intensity).Average()).Average(); + + placeholder = 1; + + // Repeat, but for targets + string targetPeptidePath = @"C:\Users\Alex\Source\Repos\chronologer\chronologer-main(noChange)\Unhealthy_CH2_J5_MBR_Predicted.tsv"; + //For MBR Predicted file + int fullSeqCol = 3; + int massCol = 5; + int rtColumn = 24; + + List targetIDs = new(); + using (StreamReader reader = new StreamReader(targetPeptidePath)) + { + reader.ReadLine(); + while (!reader.EndOfStream) + { + string[] lineSplit = reader.ReadLine().Split('\t'); + if (lineSplit[fullSeqCol].Contains('[')) continue; + if (double.Parse(lineSplit[rtColumn]) > 60) continue; + + + Peptide peptide = new Peptide(sequence: lineSplit[fullSeqCol]); + Identification targetId = new Identification(j5, peptide.BaseSequence, peptide.BaseSequence, + peptide.MonoisotopicMass, ms2RetentionTimeInMinutes: double.Parse(lineSplit[rtColumn]), + chargeState: peptide.MonoisotopicMass > 1600 ? 3 : 2, pgs); + targetIDs.Add(targetId); + } + } + + + engine = new FlashLfqEngine( + targetIDs + ); + + engine.CalculateTheoreticalIsotopeDistributions(); + engine._ms1Scans = new Dictionary(); + engine._peakIndexingEngine.IndexMassSpectralPeaks(j5, true, engine._ms1Scans); + + + List targetPeaks = new(); + List massDifferencesTarget = new(); + Dictionary apexPeakDictTarget = new(); + int targetsConsidered = 0; + foreach (Identification target in targetIDs) + { + PpmTolerance tolerance = new PpmTolerance(10); + var foundPeak = engine.FindDecoyPeak( + j5, + apexPeakDictTarget, + tolerance, + (target.Ms2RetentionTimeInMinutes, rtRange, null, null), + target); + + if (foundPeak != null) + { + targetPeaks.Add(foundPeak); + massDifferencesTarget.Add( + Math.Abs( + target.PeakfindingMass - foundPeak.Apex.IndexedPeak.Mz.ToMass(foundPeak.Apex.ChargeState) + )); + } + + targetsConsidered++; + if (targetPeaks.Count >= 750) break; + } + + double massDiffMeanT = massDifferencesTarget.Select(m => Math.Abs(m)).Average(); + double envelopeCountMeanT = targetPeaks.Select(peak => peak.IsotopicEnvelopes.Count).Average(); + double intensityMeanT = targetPeaks.Select(peak => peak.IsotopicEnvelopes.Select(e => e.Intensity).Average()).Average(); + + placeholder = 2; + + using(StreamWriter writer = new StreamWriter(@"C:\Users\Alex\Desktop\MBR_10_30\Take8_MbrTargetRT_4MinWindow.tsv")) + { + string[] header = new string[] + { + "Sequence", + "Target/Decoy", + "Theoretical Peak Finding Mass", + "Found Mass", + "Number of Scans", + "Apex Intensity", + "Retention Time", + "Predicted Retention Time" + }; + writer.WriteLine(string.Join('\t', header)); + + foreach (var decoy in decoyPeaks) + { + double peakFindingMass = decoy.Identifications.First().PeakfindingMass; + header = new string[] + { + decoy.Identifications.First().BaseSequence, + "D", + decoy.Identifications.First().PeakfindingMass.ToString(), + decoy.Apex.IndexedPeak.Mz.ToMass(decoy.Apex.ChargeState).ToString(), + decoy.IsotopicEnvelopes.Count.ToString(), + decoy.Apex.Intensity.ToString(), + decoy.Apex.IndexedPeak.RetentionTime.ToString(), + decoy.Identifications.First().Ms2RetentionTimeInMinutes.ToString() + }; + writer.WriteLine(string.Join('\t', header)); + } + + foreach (var target in targetPeaks) + { + double peakFindingMass = target.Identifications.First().PeakfindingMass; + header = new string[] + { + target.Identifications.First().BaseSequence, + "T", + target.Identifications.First().PeakfindingMass.ToString(), + target.Apex.IndexedPeak.Mz.ToMass(target.Apex.ChargeState).ToString(), + target.IsotopicEnvelopes.Count.ToString(), + target.Apex.Intensity.ToString(), + target.Apex.IndexedPeak.RetentionTime.ToString(), + target.Identifications.First().Ms2RetentionTimeInMinutes.ToString() + }; + writer.WriteLine(string.Join('\t', header)); + } + } + + } + + } +} From 31ff285627737c19d78a60904b7a7f407067555e Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 30 Jan 2024 12:06:08 -0600 Subject: [PATCH 04/55] Removed decoy toggle from GetIsotopicEnvelopes --- mzLib/FlashLFQ/FlashLfqEngine.cs | 33 +++++++++++++------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index bf496d369..ad37f11aa 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -721,7 +721,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { Ms1ScanInfo scan = ms1ScanInfos[j]; if (scan.RetentionTime <= lowerRangeRtHypothesis) + { start = scan; + } if (scan.RetentionTime >= upperRangeRtHypothesis) { end = scan; @@ -739,8 +741,10 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - // TODO: For decoys, need to increase ppm tolerance until something is found or a maximum is reached - // Decoys, just do one charge state + // Grab the retention time of a random peptide in the donor file + // If it is not outside of the rtInfo.predictedRT +/- rtInfo.range (twice the width of actual window), redraw + + foreach (int z in chargesToMatch) { List chargeXic = new List(); @@ -760,7 +764,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // remove the clustered isotopic envelopes from the list of seeds after each iteration while (chargeEnvelopes.Any()) { - ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); + ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, + mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, + acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); if (acceptorPeak == null) continue; @@ -1046,7 +1052,8 @@ internal ChromatographicPeak FindAcceptorPeak( Normal rtScoringDistribution, Identification donorIdentification, int z, - List chargeEnvelopes) + List chargeEnvelopes, + bool isDecoy = false) { var acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -1360,8 +1367,7 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) public List GetIsotopicEnvelopes( List xic, Identification identification, - int chargeState, - bool decoySearch = false) + int chargeState) { var isotopicEnvelopes = new List(); var isotopeMassShifts = _modifiedSequenceToIsotopicDistribution[identification.ModifiedSequence]; @@ -1450,22 +1456,9 @@ public List GetIsotopicEnvelopes( continue; } - if (decoySearch) - { - isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum())); - } // Check that the experimental envelope matches the theoretical - if (decoySearch | CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance, out var corr)) + if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance, out var corr)) { - if(decoySearch) - { - if(corr > 0.3) - { - isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum(), corr)); - } - - continue; - } // impute unobserved isotope peak intensities // TODO: Figure out why value imputation is performed. Build a toggle? for (int i = 0; i < experimentalIsotopeIntensities.Length; i++) From dda57b692975462266fca6b0b86395800d8ba205 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 30 Jan 2024 14:06:55 -0600 Subject: [PATCH 05/55] About to start MbrScorer refactor --- mzLib/FlashLFQ/FlashLfqEngine.cs | 187 ++++++++++++++++++++++++++++--- mzLib/FlashLFQ/MbrScorer.cs | 77 +++++++++++++ mzLib/FlashLFQ/RtInfo.cs | 24 ++++ 3 files changed, 273 insertions(+), 15 deletions(-) create mode 100644 mzLib/FlashLFQ/MbrScorer.cs create mode 100644 mzLib/FlashLFQ/RtInfo.cs diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index ad37f11aa..cde434892 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -475,7 +475,7 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) _results.Peaks[fileInfo].AddRange(chromatographicPeaks.ToList()); } - internal (double predictedRt, double range, double? rtSd, double? rtInterquartileRange)? PredictRetentionTime( + internal RtInfo PredictRetentionTime( RetentionTimeCalibDataPoint[] rtCalibrationCurve, ChromatographicPeak donorPeak, SpectraFileInfo acceptorFile, SpectraFileInfo donorFile, @@ -574,7 +574,42 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) rtRange = Math.Min(rtRange, MbrRtWindow); - return (predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, range: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); + } + + private MbrScorer BuildMbrScorer(List acceptorFileIdentifiedPeaks, out Tolerance fileSpecificMbrTolerance) + { + // Ppm distribution + var apexToAcceptorFilePeakDict = new Dictionary(); + List ppmErrors = new List(); + foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null)) + { + if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak)) + { + apexToAcceptorFilePeakDict.Add(peak.Apex.IndexedPeak, peak); + } + + ppmErrors.Add(peak.MassError); + } + if (ppmErrors.Count < 3) + { + fileSpecificMbrTolerance = null; + return null; + } + double ppmSpread = ppmErrors.Count > 30 ? ppmErrors.InterquartileRange() / 1.36 : ppmErrors.StandardDeviation(); + Normal ppmDistribution = new Normal(ppmErrors.Median(), ppmSpread); + double fileSpecificMbrPpmTolerance = Math.Min(Math.Abs(ppmErrors.Median()) + ppmSpread * 4, MbrPpmTolerance); + fileSpecificMbrTolerance = new PpmTolerance(fileSpecificMbrPpmTolerance); // match between runs PPM tolerance + + // Intensity Distribution + var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks + .Where(p => p.Intensity > 0) + .Select(p => Math.Log(p.Intensity, 2)) + .ToList(); + double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median(); + Normal intensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); + + return new MbrScorer(apexToAcceptorFilePeakDict, ppmDistribution, intensityDistribution) } /// @@ -594,6 +629,8 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // acceptor file known peaks var acceptorFileIdentifiedPeaks = _results.Peaks[idAcceptorFile]; + + #region ppmScore var apexToAcceptorFilePeak = new Dictionary(); // Ppm distribution @@ -615,6 +652,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) Normal ppmDistribution = new Normal(ppmErrors.Median(), ppmSpread); double filespecificMbrPpmTolerance = Math.Min(Math.Abs(ppmErrors.Median()) + ppmSpread * 4, MbrPpmTolerance); Tolerance mbrTol = new PpmTolerance(filespecificMbrPpmTolerance); // match between runs PPM tolerance + #endregion // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier _peakIndexingEngine.DeserializeIndex(idAcceptorFile); @@ -623,12 +661,15 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks .Where(p => p.IsotopicEnvelopes.Any()) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); + + #region IntensityScore var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks .Where(p => p.Intensity > 0) .Select(p => Math.Log(p.Intensity, 2)) .ToList(); double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median(); Normal intensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); + #endregion HashSet thisFilesMsmsIdentifiedProteins = new HashSet(); if (RequireMsmsIdInCondition) @@ -673,6 +714,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) .Distinct() .Count() > 1; + MbrScorer scorer = BuildMbrScorer() + + var donorFileLogIntensities = idDonorPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); double medianDonorLogIntensity = donorFileLogIntensities.Median(); @@ -701,17 +745,20 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) for (int i = range.Item1; i < range.Item2; i++) { + ChromatographicPeak donorPeak = idDonorPeaks[i]; // TODO: Add a toggle that set rtRange to be maximum width var predictionResults = PredictRetentionTime(rtCalibrationCurve, donorPeak, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (predictionResults == null) continue; - (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo = ((double, double, double?, double?))predictionResults; + RtInfo rtInfo = predictionResults; // this is the RT in the acceptor file to look around to find this analyte - double acceptorFileRtHypothesis = rtInfo.predictedRt; - double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtInfo.range / 2.0); - double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtInfo.range / 2.0); - Normal rtScoringDistribution = new Normal(acceptorFileRtHypothesis, rtInfo.range / 6); + double acceptorFileRtHypothesis = rtInfo.PredictedRt; + double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtInfo.Width / 2.0); + double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtInfo.Width / 2.0); + Normal rtScoringDistribution = new Normal(acceptorFileRtHypothesis, rtInfo.Width / 6); + + // TODO: add the decoy search here // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; @@ -744,7 +791,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // Grab the retention time of a random peptide in the donor file // If it is not outside of the rtInfo.predictedRT +/- rtInfo.range (twice the width of actual window), redraw - foreach (int z in chargesToMatch) { List chargeXic = new List(); @@ -845,7 +891,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) }); } - // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -1037,8 +1082,121 @@ private void CalculateFoldChangeBetweenFiles(SpectraFileInfo idAcceptorFile, Lis } } - // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. - // remove the clustered isotopic envelopes from the list of seeds after each iteration + internal void FindAllAcceptorPeaks( + SpectraFileInfo idAcceptorFile, + double lowerRangeRtHypothesis, + double upperRangeRtHypothesis, + ChromatographicPeak donorPeak, + Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific + ) + { + // get the MS1 scan info for this region so we can look up indexed peaks + Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; + Ms1ScanInfo start = ms1ScanInfos[0]; + Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + for (int j = 0; j < ms1ScanInfos.Length; j++) + { + Ms1ScanInfo scan = ms1ScanInfos[j]; + if (scan.RetentionTime <= lowerRangeRtHypothesis) + { + start = scan; + } + if (scan.RetentionTime >= upperRangeRtHypothesis) + { + end = scan; + break; + } + } + + // now we've identified the region in the chromatography this analyte should appear. + // we need to check for peaks in the region using ppm tolerance and isotope pattern matching + var chargesToMatch = donorPeak.Identifications.Select(p => p.PrecursorChargeState).Distinct().ToList(); + if (!chargesToMatch.Contains(donorPeak.Apex.ChargeState)) + { + chargesToMatch.Add(donorPeak.Apex.ChargeState); + } + + Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + + // Grab the retention time of a random peptide in the donor file + // If it is not outside of the rtInfo.predictedRT +/- rtInfo.range (twice the width of actual window), redraw + + foreach (int z in chargesToMatch) + { + List chargeXic = new List(); + + for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) + { + IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(donorIdentification.PeakfindingMass, j, mbrTol, z); + if (peak != null) + chargeXic.Add(peak); + } + if (!chargeXic.Any()) + continue; + + List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z); + + // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. + // remove the clustered isotopic envelopes from the list of seeds after each iteration + while (chargeEnvelopes.Any()) + { + ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, + mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, + acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); + if (acceptorPeak == null) + continue; + + // save the peak hypothesis + // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak + if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) + { + if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) + { + var samePeakSameSequence = existing + .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); + + if (samePeakSameSequence != null) + { + samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; + samePeakSameSequence.Identifications.Add(donorIdentification); + } + else + { + existing.Add(acceptorPeak); + } + } + else + { + mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); + } + } + else + { + matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); + matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); + } + } + } + } + + /// + /// Grabs the first isotopic envelope in the list of chargeEnvelopes as a potential seed for a chromatographic peak. + /// remove the isotopic envelope from chargeEnvelopes afterward. + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// An acceptor chromatographic peak, unless the peak found was already linked to an MS/MS id, in which case it return null. internal ChromatographicPeak FindAcceptorPeak( SpectraFileInfo idAcceptorFile, Dictionary apexToAcceptorFilePeak, @@ -1047,13 +1205,12 @@ internal ChromatographicPeak FindAcceptorPeak( double medianAcceptorLogIntensity, Normal intensityDistribution, Normal foldChangeDistribution, - (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo, + RtInfo rtInfo, double acceptorFileRtHypothesis, Normal rtScoringDistribution, Identification donorIdentification, int z, - List chargeEnvelopes, - bool isDecoy = false) + List chargeEnvelopes) { var acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -1062,7 +1219,7 @@ internal ChromatographicPeak FindAcceptorPeak( List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorIdentification, z); acceptorPeak.IsotopicEnvelopes.AddRange(bestChargeEnvelopes); acceptorPeak.CalculateIntensityForThisFeature(Integrate); - acceptorPeak.SetRtWindow(acceptorFileRtHypothesis, rtInfo.rtSd, rtInfo.rtInterquartileRange); + acceptorPeak.SetRtWindow(acceptorFileRtHypothesis, rtInfo.RtSd, rtInfo.RtInterquartileRange); CutPeak(acceptorPeak, seedEnv.IndexedPeak.RetentionTime); diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs new file mode 100644 index 000000000..b4b6b4ccb --- /dev/null +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -0,0 +1,77 @@ +using MathNet.Numerics.Distributions; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ +{ + /// + /// This class takes in an intensity distribution, a log foldchange distribution, and a ppm distribution + /// unique to each donor file - acceptor file pair + /// + internal class MbrScorer + { + // Intensity and ppm distribution are specific to each acceptor file + private readonly Normal _logIntensityDistribution; + private readonly Normal _ppmDistribution; + // The logFcDistributions are unique to each donor file - acceptor file pair + private Dictionary _logFcDistributionDictionary; + + internal Dictionary ApexToAcceptorFilePeakDict { get; } + + /// + /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution + /// unique to each donor file - acceptor file pair. These are used to score MBR matches + /// + internal MbrScorer(Dictionary apexToAcceptorFilePeakDict, + Normal ppmDistribution, Normal logIntensityDistribution) + { + _logIntensityDistribution = intensityDistribution; + _ppmDistribution = ppmDistribution; + _logFcDistributionDictionary = new(); + } + + internal double ScoreMbr(Normal rtDistribution, double retentionTime, double ppmError, double acceptorIntensity, ChromatographicPeak? donorPeak = null) + { + double intensityDensity; + if (donorPeak != null && acceptorIntensity != 0 && donorPeak.Intensity != 0 && + _logFcDistributionDictionary.TryGetValue(donorPeak.SpectraFileInfo, out var logFcDistribution)) + { + intensityDensity = logFcDistribution.Density( + Math.Log(acceptorIntensity, 2) - Math.Log(donorPeak.Intensity, 2) + ); + } + else + { + var logIntensity = Math.Log(acceptorIntensity, 2); + // I don't know what the if/else statement accomplishes. It feels like we should take the density regardless + // As it is, the score is artifically inflated for very intense peaks + if (logIntensity < _logIntensityDistribution.Median) + intensityDensity = _logIntensityDistribution.Density(logIntensity); + else + intensityDensity = _logIntensityDistribution.Density(_logIntensityDistribution.Mode); + } + + double intensityScore = DensityScoreConversion(intensityDensity); + double ppmScore = DensityScoreConversion(_ppmDistribution.Density(ppmError)); + double rtScore = DensityScoreConversion(rtDistribution.Density(retentionTime)); + + return ppmScore + rtScore + intensityScore; + } + + /// + /// Takes in the density of a normal distribution at a given point, and transforms it + /// by taking the log of the density plus the square root of the squared density plus one + /// This transformation was implemented in the original code, and we're unsure of the rationale + /// + /// A Normal distribution + /// The transformed score + private double DensityScoreConversion(double density) + { + return Math.Log(density + Math.Sqrt(Math.Pow(density, 2) + 1)); + } + + } +} diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs new file mode 100644 index 000000000..1b8614f6a --- /dev/null +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -0,0 +1,24 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ +{ + internal class RtInfo + { + public double PredictedRt { get; } + public double Width { get; } + public double? RtSd { get; } + public double? RtInterquartileRange { get; } + + internal RtInfo(double predictedRt, double width, double? rtSd, double? rtInterquartileRange) + { + PredictedRt = predictedRt; + Width = width; + RtSd = rtSd; + RtInterquartileRange = rtInterquartileRange; + } + } +} From ca9928a5cd4ade501aa00428c23de4c5fe4c8131 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 30 Jan 2024 14:53:38 -0600 Subject: [PATCH 06/55] Refactor of MBR is working succesfully --- mzLib/FlashLFQ/FlashLfqEngine.cs | 279 ++++--------------------------- mzLib/FlashLFQ/MbrScorer.cs | 71 +++++++- mzLib/FlashLFQ/RtInfo.cs | 6 +- 3 files changed, 106 insertions(+), 250 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index cde434892..6d7481de0 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -609,7 +609,7 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median(); Normal intensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); - return new MbrScorer(apexToAcceptorFilePeakDict, ppmDistribution, intensityDistribution) + return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, intensityDistribution); } /// @@ -630,46 +630,17 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // acceptor file known peaks var acceptorFileIdentifiedPeaks = _results.Peaks[idAcceptorFile]; - #region ppmScore - var apexToAcceptorFilePeak = new Dictionary(); - - // Ppm distribution - List ppmErrors = new List(); - foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null)) - { - if (!apexToAcceptorFilePeak.ContainsKey(peak.Apex.IndexedPeak)) - { - apexToAcceptorFilePeak.Add(peak.Apex.IndexedPeak, peak); - } - - ppmErrors.Add(peak.MassError); - } - if (ppmErrors.Count < 3) - { - return; - } - double ppmSpread = ppmErrors.Count > 30 ? ppmErrors.InterquartileRange() / 1.36 : ppmErrors.StandardDeviation(); - Normal ppmDistribution = new Normal(ppmErrors.Median(), ppmSpread); - double filespecificMbrPpmTolerance = Math.Min(Math.Abs(ppmErrors.Median()) + ppmSpread * 4, MbrPpmTolerance); - Tolerance mbrTol = new PpmTolerance(filespecificMbrPpmTolerance); // match between runs PPM tolerance - #endregion - - // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier - _peakIndexingEngine.DeserializeIndex(idAcceptorFile); - // these are the analytes already identified in this run. we don't need to try to match them from other runs var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks .Where(p => p.IsotopicEnvelopes.Any()) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); - #region IntensityScore - var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks - .Where(p => p.Intensity > 0) - .Select(p => Math.Log(p.Intensity, 2)) - .ToList(); - double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median(); - Normal intensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); - #endregion + MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); + if (scorer == null) + return; + + // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier + _peakIndexingEngine.DeserializeIndex(idAcceptorFile); HashSet thisFilesMsmsIdentifiedProteins = new HashSet(); if (RequireMsmsIdInCondition) @@ -714,23 +685,13 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) .Distinct() .Count() > 1; - MbrScorer scorer = BuildMbrScorer() - - - var donorFileLogIntensities = idDonorPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); - double medianDonorLogIntensity = donorFileLogIntensities.Median(); - - // Find the difference in peptide intensities between donor and acceptor files - // this intensity score creates a conservative bias in MBR - List listOfFoldChangesBetweenTheFiles = new List(); + // We're only interested in the fold change if the conditions are different. Otherwise, we score based off of the intensities + // of the acceptor file if (_spectraFileInfo.Select(p => p.Condition).Distinct().Count() > 1 && idDonorFile.Condition != idAcceptorFile.Condition) { - CalculateFoldChangeBetweenFiles(idAcceptorFile, idDonorPeaks, listOfFoldChangesBetweenTheFiles); + scorer.CalculateFoldChangeBetweenFiles(idDonorPeaks); } - Normal foldChangeDistribution = listOfFoldChangesBetweenTheFiles.Count > 100 - ? new Normal(listOfFoldChangesBetweenTheFiles.Median(), listOfFoldChangesBetweenTheFiles.StandardDeviation()) - : null; // generate RT calibration curve RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile); @@ -745,111 +706,14 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) for (int i = range.Item1; i < range.Item2; i++) { - ChromatographicPeak donorPeak = idDonorPeaks[i]; // TODO: Add a toggle that set rtRange to be maximum width - var predictionResults = PredictRetentionTime(rtCalibrationCurve, donorPeak, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); - if (predictionResults == null) continue; - RtInfo rtInfo = predictionResults; - - // this is the RT in the acceptor file to look around to find this analyte - double acceptorFileRtHypothesis = rtInfo.PredictedRt; - double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtInfo.Width / 2.0); - double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtInfo.Width / 2.0); - Normal rtScoringDistribution = new Normal(acceptorFileRtHypothesis, rtInfo.Width / 6); - - // TODO: add the decoy search here - - // get the MS1 scan info for this region so we can look up indexed peaks - Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; - Ms1ScanInfo start = ms1ScanInfos[0]; - Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; - for (int j = 0; j < ms1ScanInfos.Length; j++) - { - Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= lowerRangeRtHypothesis) - { - start = scan; - } - if (scan.RetentionTime >= upperRangeRtHypothesis) - { - end = scan; - break; - } - } - - // now we've identified the region in the chromatography this analyte should appear. - // we need to check for peaks in the region using ppm tolerance and isotope pattern matching - var chargesToMatch = donorPeak.Identifications.Select(p => p.PrecursorChargeState).Distinct().ToList(); - if (!chargesToMatch.Contains(donorPeak.Apex.ChargeState)) - { - chargesToMatch.Add(donorPeak.Apex.ChargeState); - } + RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + if (rtInfo == null) continue; - Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - - // Grab the retention time of a random peptide in the donor file - // If it is not outside of the rtInfo.predictedRT +/- rtInfo.range (twice the width of actual window), redraw - - foreach (int z in chargesToMatch) - { - List chargeXic = new List(); - - for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) - { - IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(donorIdentification.PeakfindingMass, j, mbrTol, z); - if (peak != null) - chargeXic.Add(peak); - } - if (!chargeXic.Any()) - continue; - - List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z); - - // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. - // remove the clustered isotopic envelopes from the list of seeds after each iteration - while (chargeEnvelopes.Any()) - { - ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, - mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, - acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); - if (acceptorPeak == null) - continue; - - // save the peak hypothesis - // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak - if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) - { - if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; - samePeakSameSequence.Identifications.Add(donorIdentification); - } - else - { - existing.Add(acceptorPeak); - } - } - else - { - mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); - } - } - else - { - matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); - matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); - } - } - } + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); } - // merge results from different threads lock (matchBetweenRunsIdentifiedPeaks) { foreach (var kvp in matchBetweenRunsIdentifiedPeaksThreadSpecific) @@ -1044,51 +908,13 @@ internal void MbrDecoySearch(SpectraFileInfo acceptorFile) // report + store these peaks } - private void CalculateFoldChangeBetweenFiles(SpectraFileInfo idAcceptorFile, List idDonorPeaks, List listOfFoldChangesBetweenTheFiles) - { - var acceptorFileBestMsmsPeaks = new Dictionary(); - - IEnumerable acceptorPeaks = _results - .Peaks[idAcceptorFile] - .Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - - // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) - { - if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > acceptorPeak.Intensity) - { - acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; - } - } - else - { - acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); - } - } - - foreach (var donorPeak in idDonorPeaks) - { - double donorPeakIntensity = donorPeak.Intensity; - if (acceptorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out var acceptorPeak)) - { - double acceptorPeakIntensity = acceptorPeak.Intensity; - - double intensityLogFoldChange = Math.Log(acceptorPeakIntensity, 2) - Math.Log(donorPeakIntensity, 2); - - listOfFoldChangesBetweenTheFiles.Add(intensityLogFoldChange); - } - } - } - internal void FindAllAcceptorPeaks( SpectraFileInfo idAcceptorFile, - double lowerRangeRtHypothesis, - double upperRangeRtHypothesis, + MbrScorer scorer, + RtInfo rtInfo, + Tolerance fileSpecificTol, ChromatographicPeak donorPeak, - Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific - ) + Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific) { // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; @@ -1097,11 +923,11 @@ Dictionary>> matc for (int j = 0; j < ms1ScanInfos.Length; j++) { Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= lowerRangeRtHypothesis) + if (scan.RetentionTime <= rtInfo.RtStartHypothesis) { start = scan; } - if (scan.RetentionTime >= upperRangeRtHypothesis) + if (scan.RetentionTime >= rtInfo.RtEndHypothesis) { end = scan; break; @@ -1117,6 +943,7 @@ Dictionary>> matc } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + Normal rtScoringDistribution = new Normal(rtInfo.PredictedRt, rtInfo.Width / 6); // Grab the retention time of a random peptide in the donor file // If it is not outside of the rtInfo.predictedRT +/- rtInfo.range (twice the width of actual window), redraw @@ -1127,7 +954,7 @@ Dictionary>> matc for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) { - IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(donorIdentification.PeakfindingMass, j, mbrTol, z); + IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(donorIdentification.PeakfindingMass, j, fileSpecificTol, z); if (peak != null) chargeXic.Add(peak); } @@ -1140,9 +967,7 @@ Dictionary>> matc // remove the clustered isotopic envelopes from the list of seeds after each iteration while (chargeEnvelopes.Any()) { - ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, - mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, - acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); + ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, scorer, donorPeak, fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes); if (acceptorPeak == null) continue; @@ -1184,42 +1009,31 @@ Dictionary>> matc /// remove the isotopic envelope from chargeEnvelopes afterward. /// /// - /// - /// /// - /// - /// - /// /// - /// /// - /// /// /// /// An acceptor chromatographic peak, unless the peak found was already linked to an MS/MS id, in which case it return null. internal ChromatographicPeak FindAcceptorPeak( SpectraFileInfo idAcceptorFile, - Dictionary apexToAcceptorFilePeak, - Normal ppmDistribution, + MbrScorer scorer, + ChromatographicPeak donorPeak, Tolerance mbrTol, - double medianAcceptorLogIntensity, - Normal intensityDistribution, - Normal foldChangeDistribution, RtInfo rtInfo, - double acceptorFileRtHypothesis, Normal rtScoringDistribution, - Identification donorIdentification, int z, List chargeEnvelopes) { - var acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); + var donorId = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile); IsotopicEnvelope seedEnv = chargeEnvelopes.First(); - var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorIdentification.PeakfindingMass, z, idAcceptorFile, mbrTol); - List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorIdentification, z); + var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorId.PeakfindingMass, z, idAcceptorFile, mbrTol); + List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorId, z); acceptorPeak.IsotopicEnvelopes.AddRange(bestChargeEnvelopes); acceptorPeak.CalculateIntensityForThisFeature(Integrate); - acceptorPeak.SetRtWindow(acceptorFileRtHypothesis, rtInfo.RtSd, rtInfo.RtInterquartileRange); + acceptorPeak.SetRtWindow(rtInfo.PredictedRt, rtInfo.RtSd, rtInfo.RtInterquartileRange); CutPeak(acceptorPeak, seedEnv.IndexedPeak.RetentionTime); @@ -1229,40 +1043,17 @@ internal ChromatographicPeak FindAcceptorPeak( chargeEnvelopes.RemoveAll(p => claimedPeaks.Contains(p.IndexedPeak)); // peak has already been identified by MSMS - skip it - if (apexToAcceptorFilePeak.ContainsKey(seedEnv.IndexedPeak)) + if (scorer.ApexToAcceptorFilePeakDict.ContainsKey(seedEnv.IndexedPeak)) { return null; } - // score the peak hypothesis - double rtScore = rtScoringDistribution.Density(acceptorPeak.Apex.IndexedPeak.RetentionTime); - double ppmScore = ppmDistribution.Density(acceptorPeak.MassError); - double intensityScore = 0; - - double logIntensity = Math.Log(acceptorPeak.Intensity, 2); - - if (foldChangeDistribution != null) - { - intensityScore = foldChangeDistribution.Density(logIntensity); - } - else - { - if (logIntensity < medianAcceptorLogIntensity) - { - intensityScore = intensityDistribution.Density(logIntensity); - } - else - { - intensityScore = intensityDistribution.Density(intensityDistribution.Mode); - } - } - - rtScore = Math.Log(rtScore + Math.Sqrt(Math.Pow(rtScore, 2) + 1)); - ppmScore = Math.Log(ppmScore + Math.Sqrt(Math.Pow(ppmScore, 2) + 1)); - intensityScore = Math.Log(intensityScore + Math.Sqrt(Math.Pow(intensityScore, 2) + 1)); + acceptorPeak.MbrScore = scorer.ScoreMbr(rtScoringDistribution, + retentionTime: acceptorPeak.Apex.IndexedPeak.RetentionTime, + ppmError: acceptorPeak.MassError, + acceptorIntensity: acceptorPeak.Intensity, + donorPeak); - // larger scores are better - acceptorPeak.MbrScore = (rtScore + ppmScore + intensityScore) * (1 - donorIdentification.PosteriorErrorProbability); return acceptorPeak; } diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index b4b6b4ccb..05f03da1a 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -1,4 +1,5 @@ using MathNet.Numerics.Distributions; +using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; using System.Linq; @@ -20,15 +21,22 @@ internal class MbrScorer private Dictionary _logFcDistributionDictionary; internal Dictionary ApexToAcceptorFilePeakDict { get; } + internal List UnambiguousMsMsAcceptorPeaks { get; } + /// /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution /// unique to each donor file - acceptor file pair. These are used to score MBR matches /// - internal MbrScorer(Dictionary apexToAcceptorFilePeakDict, - Normal ppmDistribution, Normal logIntensityDistribution) + internal MbrScorer( + Dictionary apexToAcceptorFilePeakDict, + List acceptorPeaks, + Normal ppmDistribution, + Normal logIntensityDistribution) { - _logIntensityDistribution = intensityDistribution; + ApexToAcceptorFilePeakDict = apexToAcceptorFilePeakDict; + UnambiguousMsMsAcceptorPeaks = acceptorPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); + _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; _logFcDistributionDictionary = new(); } @@ -58,9 +66,64 @@ internal double ScoreMbr(Normal rtDistribution, double retentionTime, double ppm double ppmScore = DensityScoreConversion(_ppmDistribution.Density(ppmError)); double rtScore = DensityScoreConversion(rtDistribution.Density(retentionTime)); - return ppmScore + rtScore + intensityScore; + double donorIdPEP = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First().PosteriorErrorProbability; + + return (ppmScore + rtScore + intensityScore) * (1 - donorIdPEP); } + /// + /// Find the difference in peptide intensities between donor and acceptor files + /// this intensity score creates a conservative bias in MBR + /// + /// List of peaks in the donoro file. + internal void CalculateFoldChangeBetweenFiles(List idDonorPeaks) + { + + var donorFileLogIntensities = idDonorPeaks.Where(p => p.Intensity > 0).Select(p => Math.Log(p.Intensity, 2)).ToList(); + double medianDonorLogIntensity = donorFileLogIntensities.Median(); + + // Find the difference in peptide intensities between donor and acceptor files + // this intensity score creates a conservative bias in MBR + List listOfFoldChangesBetweenTheFiles = new List(); + var acceptorFileBestMsmsPeaks = new Dictionary(); + + // get the best (most intense) peak for each peptide in the acceptor file + foreach (ChromatographicPeak acceptorPeak in UnambiguousMsMsAcceptorPeaks) + { + if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) + { + if (currentBestPeak.Intensity > acceptorPeak.Intensity) + { + acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; + } + } + else + { + acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); + } + } + + foreach (var donorPeak in idDonorPeaks) + { + double donorPeakIntensity = donorPeak.Intensity; + if (acceptorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out var acceptorPeak)) + { + double acceptorPeakIntensity = acceptorPeak.Intensity; + + double intensityLogFoldChange = Math.Log(acceptorPeakIntensity, 2) - Math.Log(donorPeakIntensity, 2); + + listOfFoldChangesBetweenTheFiles.Add(intensityLogFoldChange); + } + } + Normal foldChangeDistribution = listOfFoldChangesBetweenTheFiles.Count > 100 + ? new Normal(listOfFoldChangesBetweenTheFiles.Median(), listOfFoldChangesBetweenTheFiles.StandardDeviation()) + : null; + + if (foldChangeDistribution != null) + { + _logFcDistributionDictionary.Add(idDonorPeaks.First().SpectraFileInfo, foldChangeDistribution); + } + } /// /// Takes in the density of a normal distribution at a given point, and transforms it /// by taking the log of the density plus the square root of the squared density plus one diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs index 1b8614f6a..74c4e7b91 100644 --- a/mzLib/FlashLFQ/RtInfo.cs +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -6,14 +6,16 @@ namespace FlashLFQ { - internal class RtInfo + public class RtInfo { public double PredictedRt { get; } public double Width { get; } public double? RtSd { get; } public double? RtInterquartileRange { get; } + public double RtStartHypothesis => PredictedRt - (Width / 2.0); + public double RtEndHypothesis => PredictedRt + (Width / 2.0); - internal RtInfo(double predictedRt, double width, double? rtSd, double? rtInterquartileRange) + public RtInfo(double predictedRt, double width, double? rtSd, double? rtInterquartileRange) { PredictedRt = predictedRt; Width = width; From 15efaac573cb7f284ed7336b05cea9f530ed6f7c Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 30 Jan 2024 14:56:36 -0600 Subject: [PATCH 07/55] Deleted unused code --- mzLib/FlashLFQ/FlashLfqEngine.cs | 235 +-------------------- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 249 ----------------------- 2 files changed, 2 insertions(+), 482 deletions(-) delete mode 100644 mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 6d7481de0..110f358a6 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -791,121 +791,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } RunErrorChecking(idAcceptorFile); - - //MbrDecoySearch(); - } - - internal void MbrDecoySearch(SpectraFileInfo acceptorFile) - { - var decoyPeptides = new List(); - ChromatographicPeak donorPeak = null; - PpmTolerance mbrTol = new PpmTolerance(10); - - // Should do alignment with like 100 anchor peptides, build a spline, then use that to inform the - // rt hypothesis - - // get the MS1 scan info for this region so we can look up indexed peaks - Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[acceptorFile]; - Ms1ScanInfo start = ms1ScanInfos[0]; - Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; - for (int j = 0; j < ms1ScanInfos.Length; j++) - { - Ms1ScanInfo scan = ms1ScanInfos[j]; - //if (scan.RetentionTime <= lowerRangeRtHypothesis) - start = scan; - //if (scan.RetentionTime >= upperRangeRtHypothesis) - //{ - // end = scan; - // break; - //} - } - - // now we've identified the region in the chromatography this analyte should appear. - // we need to check for peaks in the region using ppm tolerance and isotope pattern matching - var chargesToMatch = donorPeak.Identifications.Select(p => p.PrecursorChargeState).Distinct().ToList(); - if (!chargesToMatch.Contains(donorPeak.Apex.ChargeState)) - { - chargesToMatch.Add(donorPeak.Apex.ChargeState); - } - - Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - - // TODO: For decoys, need to increase ppm tolerance until something is found or a maximum is reached - // Decoys, just do one charge state - foreach (int z in chargesToMatch) - { - List chargeXic = new List(); - - double adjustment = mbrTol.GetRange(donorIdentification.PeakfindingMass).Width; - - //peakfinding loop - // for every loop after first, adjust target such that it sits in the middle of the tolerance range - // below or above the previous target range. - // e.g., 0, -1, 1, -2, 2 - - - for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) - { - IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(donorIdentification.PeakfindingMass, j, mbrTol, z); - if (peak != null) - chargeXic.Add(peak); - } - if (!chargeXic.Any()) - continue; // goto peakfinidng loop - - List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z); - - // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. - // remove the clustered isotopic envelopes from the list of seeds after each iteration - // while (chargeEnvelopes.Any()) - // { - // ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, apexToAcceptorFilePeak, ppmDistribution, mbrTol, medianAcceptorLogIntensity, intensityDistribution, foldChangeDistribution, rtInfo, acceptorFileRtHypothesis, rtScoringDistribution, donorIdentification, z, chargeEnvelopes); - // if (acceptorPeak == null) - // continue; - - // // save the peak hypothesis - // // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak - // if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) - // { - // if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) - // { - // var samePeakSameSequence = existing - // .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - // if (samePeakSameSequence != null) - // { - // samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; - // samePeakSameSequence.Identifications.Add(donorIdentification); - // } - // else - // { - // existing.Add(acceptorPeak); - // } - // } - // else - // { - // mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); - // } - // } - // else - // { - // matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); - // matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); - // } - // } - } - - - //run once for each acceptor file - // for each decoy peptide - // get rt hypothesis + range - // get all MS1 scans in range (i.e., find indices - // pick a charge state where decoy would have reasonable mass (350 < m/z < 1600) - // peak pick - // loop where tolerance gets progressively larger until something is found - // find isotopic envelopes with reduced requirement for pearson correlation - // do need some maximum tolerance value. 10k ppm? - // report + store these peaks } internal void FindAllAcceptorPeaks( @@ -967,7 +852,7 @@ internal void FindAllAcceptorPeaks( // remove the clustered isotopic envelopes from the list of seeds after each iteration while (chargeEnvelopes.Any()) { - ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, scorer, donorPeak, fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes); + ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes); if (acceptorPeak == null) continue; @@ -1015,7 +900,7 @@ internal void FindAllAcceptorPeaks( /// /// /// An acceptor chromatographic peak, unless the peak found was already linked to an MS/MS id, in which case it return null. - internal ChromatographicPeak FindAcceptorPeak( + internal ChromatographicPeak FindIndividualAcceptorPeak( SpectraFileInfo idAcceptorFile, MbrScorer scorer, ChromatographicPeak donorPeak, @@ -1057,122 +942,6 @@ internal ChromatographicPeak FindAcceptorPeak( return acceptorPeak; } - internal ChromatographicPeak FindDecoyPeak( - SpectraFileInfo idAcceptorFile, - Dictionary apexToAcceptorFilePeak, - Tolerance mbrTol, - (double predictedRt, double range, double? rtSd, double? rtInterquartileRange) rtInfo, - Identification donorIdentification) - { - // this is the RT in the acceptor file to look around to find this analyte - double acceptorFileRtHypothesis = rtInfo.predictedRt; - double lowerRangeRtHypothesis = acceptorFileRtHypothesis - (rtInfo.range / 2.0); - double upperRangeRtHypothesis = acceptorFileRtHypothesis + (rtInfo.range / 2.0); - - // get the MS1 scan info for this region so we can look up indexed peaks - Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; - Ms1ScanInfo start = ms1ScanInfos[0]; - Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; - for (int j = 0; j < ms1ScanInfos.Length; j++) - { - Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= lowerRangeRtHypothesis) - start = scan; - if (scan.RetentionTime >= upperRangeRtHypothesis) - { - end = scan; - break; - } - } - - // Checking charge states 2 - 4 - List chargeStateAcceptors = new(); - for(int z = 2; z <= 4; z++) - { - int searchCount = 0; - List acceptorPeakCandidates = new(); - while (acceptorPeakCandidates.Count < 1) - { - if (searchCount > 1000) break; - - // Select the peakFindingMass that will be used for this round of search - int searchRangeCoeff = (searchCount + 2 - 1) / 2; // Integer division that returns ceiling: https://stackoverflow.com/questions/17944/how-to-round-up-the-result-of-integer-division - searchRangeCoeff = searchCount % 2 == 0 ? searchRangeCoeff : -1 * searchRangeCoeff; - double peakFindingMass = donorIdentification.PeakfindingMass - + mbrTol.GetRange(donorIdentification.PeakfindingMass.ToMz(z)).Width * searchRangeCoeff; - - // Pull every imsPeak in the given time range for the peak finding mass - List fullRangeXic = new List(); - for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) - { - IndexedMassSpectralPeak peak = _peakIndexingEngine.GetIndexedPeak(peakFindingMass, j, mbrTol, z); - if (peak != null) - fullRangeXic.Add(peak); - } - if (!fullRangeXic.Any()) - { - searchCount++; - continue; - } - - // Find peaks in the XIC - List seedEnvelopes = GetIsotopicEnvelopes(fullRangeXic, donorIdentification, z); - if (!seedEnvelopes.Any()) - { - searchCount++; - continue; - } - while (seedEnvelopes.Any()) - { - IsotopicEnvelope seedEnvelope = seedEnvelopes.First(); - // peak has already been identified by MSMS - skip it - if (apexToAcceptorFilePeak.ContainsKey(seedEnvelope.IndexedPeak)) - { - seedEnvelopes.Remove(seedEnvelope); - continue; - } - - var xic = Peakfind(seedEnvelope.IndexedPeak.RetentionTime, peakFindingMass, z, idAcceptorFile, mbrTol); - List localEnvelopes = GetIsotopicEnvelopes(xic, donorIdentification, z); - if (localEnvelopes.Count() < 3) - { - seedEnvelopes.Remove(seedEnvelope); - continue; - } - ChromatographicPeak acceptorPeak = new ChromatographicPeak(donorIdentification, true, idAcceptorFile); - acceptorPeak.IsotopicEnvelopes.AddRange(localEnvelopes); - acceptorPeak.CalculateIntensityForThisFeature(Integrate); - //acceptorPeak.SetRtWindow(acceptorFileRtHypothesis, rtInfo.rtSd, rtInfo.rtInterquartileRange); - - CutPeak(acceptorPeak, seedEnvelope.IndexedPeak.RetentionTime); - acceptorPeakCandidates.Add(acceptorPeak); - - var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)); - if (claimedPeaks.Max(p => p.ZeroBasedMs1ScanIndex) < seedEnvelopes.Min(e => e.IndexedPeak.ZeroBasedMs1ScanIndex)) - break; - seedEnvelopes.Remove(seedEnvelope); - seedEnvelopes.RemoveAll(p => claimedPeaks.Contains(p.IndexedPeak)); - } - searchCount++; - } - - if (!acceptorPeakCandidates.Any()) - continue; - - // Best peak is selected with a shitty heuristic, summing the isotopic correlation of all envelopes - ChromatographicPeak bestPeak = acceptorPeakCandidates.MinBy(peak => - Math.Abs(peak.Identifications.First().PeakfindingMass - peak.Apex.IndexedPeak.Mz.ToMass(peak.Apex.ChargeState))); - chargeStateAcceptors.Add(bestPeak); - } - - if (!chargeStateAcceptors.Any()) return null; - - ChromatographicPeak bestOverallPeak = chargeStateAcceptors.MinBy(peak => - Math.Abs(peak.Identifications.First().PeakfindingMass - peak.Apex.IndexedPeak.Mz.ToMass(peak.Apex.ChargeState))); - bestOverallPeak.MbrScore = -1; - return bestOverallPeak; - } - /// /// Used by the match-between-runs algorithm to determine systematic retention time drifts between /// chromatographic runs. diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs deleted file mode 100644 index b8ce7f152..000000000 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ /dev/null @@ -1,249 +0,0 @@ -using Chemistry; -using FlashLFQ; -using MassSpectrometry; -using MathNet.Numerics.Distributions; -using MathNet.Numerics.Statistics; -using MzLibUtil; -using NUnit.Framework; -using Proteomics.AminoAcidPolymer; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using Easy.Common.Extensions; -using Test.FileReadingTests; -using UsefulProteomicsDatabases; -using ChromatographicPeak = FlashLFQ.ChromatographicPeak; -using Stopwatch = System.Diagnostics.Stopwatch; -using Peptide = Proteomics.AminoAcidPolymer.Peptide; - - -namespace Test -{ - [TestFixture] - [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] - internal class MbrTargetDecoyTest - { - [Test] - [TestCase(0, ExpectedResult = 0)] - [TestCase(1, ExpectedResult = -1)] - [TestCase(2, ExpectedResult = 1)] - [TestCase(3, ExpectedResult = -2)] - [TestCase(5, ExpectedResult = -3)] - [TestCase(6, ExpectedResult = 3)] - public static int TestDecoySearchFlipFlop(int searchCount) - { - // Integer division take ceiling: https://stackoverflow.com/questions/17944/how-to-round-up-the-result-of-integer-division - int result = (searchCount + 2 - 1) / 2; - result = searchCount % 2 == 0 ? result : -1 * result; - - return result; - } - - [Test] - public static void DecoyPeakFindTrial() - { - string decoyPeptidePath = @"C:\Users\Alex\Source\Repos\chronologer\chronologer-main(noChange)\Arabidopsis_Tryptic_Peptides_Slice.tsv"; - string spectraFilePath = @"D:\SingleCellDataSets\Organoid\Calibration_MM_320\Task1-CalibrateTask\HFL1SC_Unhealthy_CH2_J5-calib.mzML"; - ProteinGroup pg = new ProteinGroup("xyz", "x", "z"); - List pgs = new List { pg }; - SpectraFileInfo j5 = new SpectraFileInfo(spectraFilePath, "A", 1, 1, 1); - double rtRange = 4.0; - - List decoys = new(); - Loaders.LoadElements(); - - using (StreamReader reader = new StreamReader(decoyPeptidePath)) - { - reader.ReadLine(); - while(!reader.EndOfStream) - { - string[] lineSplit = reader.ReadLine().Split('\t'); - if (double.Parse(lineSplit[4]) > 57.5) continue; - - Peptide peptide = new Peptide(sequence: lineSplit[0]); - Identification decoyId = new Identification(j5, peptide.BaseSequence, peptide.BaseSequence, - peptide.MonoisotopicMass, ms2RetentionTimeInMinutes: double.Parse(lineSplit[4]), - chargeState: peptide.MonoisotopicMass > 1600 ? 3 : 2, pgs); - decoys.Add(decoyId); - } - } - - - FlashLfqEngine engine = new FlashLfqEngine( - decoys - ); - - engine.CalculateTheoreticalIsotopeDistributions(); - engine._ms1Scans = new Dictionary(); - engine._peakIndexingEngine.IndexMassSpectralPeaks(j5, true, engine._ms1Scans); - - - List decoyPeaks = new(); - List massDifferences = new(); - Dictionary apexPeakDict = new(); - int decoysConsidered = 0; - Random rnd = new Random(); - foreach (Identification decoy in decoys) - { - - int rndInt = rnd.Next(1, 13); - // Eliminate ~ half of the decoys with mass greater than 2000 daltons - // This is an ad-hoc way of matching target and decoy mass distribution - if (decoy.PeakfindingMass > 1800 && rndInt % 2 == 0) continue; - else if (decoy.PeakfindingMass > 1400 && rndInt % 3 == 0) continue; - - if (decoy.Ms2RetentionTimeInMinutes < 8 && rndInt < 11) continue; - - PpmTolerance tolerance = new PpmTolerance(10); - var foundPeak = engine.FindDecoyPeak( - j5, - apexPeakDict, - tolerance, - (decoy.Ms2RetentionTimeInMinutes, rtRange, null, null), - decoy); - - if (foundPeak != null) - { - decoyPeaks.Add(foundPeak); - massDifferences.Add( - Math.Abs( - decoy.PeakfindingMass - foundPeak.Apex.IndexedPeak.Mz.ToMass(foundPeak.Apex.ChargeState) - )); - } - - decoysConsidered++; - if (decoyPeaks.Count >= 750) break; - } - - int placeholder = 0; - - double massDiffMean = massDifferences.Select(m => Math.Abs(m)).Average(); - double envelopeCountMean = decoyPeaks.Select(peak => peak.IsotopicEnvelopes.Count).Average(); - double intensityMean = decoyPeaks.Select(peak => peak.IsotopicEnvelopes.Select(e => e.Intensity).Average()).Average(); - - placeholder = 1; - - // Repeat, but for targets - string targetPeptidePath = @"C:\Users\Alex\Source\Repos\chronologer\chronologer-main(noChange)\Unhealthy_CH2_J5_MBR_Predicted.tsv"; - //For MBR Predicted file - int fullSeqCol = 3; - int massCol = 5; - int rtColumn = 24; - - List targetIDs = new(); - using (StreamReader reader = new StreamReader(targetPeptidePath)) - { - reader.ReadLine(); - while (!reader.EndOfStream) - { - string[] lineSplit = reader.ReadLine().Split('\t'); - if (lineSplit[fullSeqCol].Contains('[')) continue; - if (double.Parse(lineSplit[rtColumn]) > 60) continue; - - - Peptide peptide = new Peptide(sequence: lineSplit[fullSeqCol]); - Identification targetId = new Identification(j5, peptide.BaseSequence, peptide.BaseSequence, - peptide.MonoisotopicMass, ms2RetentionTimeInMinutes: double.Parse(lineSplit[rtColumn]), - chargeState: peptide.MonoisotopicMass > 1600 ? 3 : 2, pgs); - targetIDs.Add(targetId); - } - } - - - engine = new FlashLfqEngine( - targetIDs - ); - - engine.CalculateTheoreticalIsotopeDistributions(); - engine._ms1Scans = new Dictionary(); - engine._peakIndexingEngine.IndexMassSpectralPeaks(j5, true, engine._ms1Scans); - - - List targetPeaks = new(); - List massDifferencesTarget = new(); - Dictionary apexPeakDictTarget = new(); - int targetsConsidered = 0; - foreach (Identification target in targetIDs) - { - PpmTolerance tolerance = new PpmTolerance(10); - var foundPeak = engine.FindDecoyPeak( - j5, - apexPeakDictTarget, - tolerance, - (target.Ms2RetentionTimeInMinutes, rtRange, null, null), - target); - - if (foundPeak != null) - { - targetPeaks.Add(foundPeak); - massDifferencesTarget.Add( - Math.Abs( - target.PeakfindingMass - foundPeak.Apex.IndexedPeak.Mz.ToMass(foundPeak.Apex.ChargeState) - )); - } - - targetsConsidered++; - if (targetPeaks.Count >= 750) break; - } - - double massDiffMeanT = massDifferencesTarget.Select(m => Math.Abs(m)).Average(); - double envelopeCountMeanT = targetPeaks.Select(peak => peak.IsotopicEnvelopes.Count).Average(); - double intensityMeanT = targetPeaks.Select(peak => peak.IsotopicEnvelopes.Select(e => e.Intensity).Average()).Average(); - - placeholder = 2; - - using(StreamWriter writer = new StreamWriter(@"C:\Users\Alex\Desktop\MBR_10_30\Take8_MbrTargetRT_4MinWindow.tsv")) - { - string[] header = new string[] - { - "Sequence", - "Target/Decoy", - "Theoretical Peak Finding Mass", - "Found Mass", - "Number of Scans", - "Apex Intensity", - "Retention Time", - "Predicted Retention Time" - }; - writer.WriteLine(string.Join('\t', header)); - - foreach (var decoy in decoyPeaks) - { - double peakFindingMass = decoy.Identifications.First().PeakfindingMass; - header = new string[] - { - decoy.Identifications.First().BaseSequence, - "D", - decoy.Identifications.First().PeakfindingMass.ToString(), - decoy.Apex.IndexedPeak.Mz.ToMass(decoy.Apex.ChargeState).ToString(), - decoy.IsotopicEnvelopes.Count.ToString(), - decoy.Apex.Intensity.ToString(), - decoy.Apex.IndexedPeak.RetentionTime.ToString(), - decoy.Identifications.First().Ms2RetentionTimeInMinutes.ToString() - }; - writer.WriteLine(string.Join('\t', header)); - } - - foreach (var target in targetPeaks) - { - double peakFindingMass = target.Identifications.First().PeakfindingMass; - header = new string[] - { - target.Identifications.First().BaseSequence, - "T", - target.Identifications.First().PeakfindingMass.ToString(), - target.Apex.IndexedPeak.Mz.ToMass(target.Apex.ChargeState).ToString(), - target.IsotopicEnvelopes.Count.ToString(), - target.Apex.Intensity.ToString(), - target.Apex.IndexedPeak.RetentionTime.ToString(), - target.Identifications.First().Ms2RetentionTimeInMinutes.ToString() - }; - writer.WriteLine(string.Join('\t', header)); - } - } - - } - - } -} From 48239cc692efb1841bb0e73ec19bf0ba88b0eecb Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 30 Jan 2024 18:45:53 -0600 Subject: [PATCH 08/55] got those decoys decoying naw mean --- mzLib/FlashLFQ/ChromatographicPeak.cs | 6 ++- mzLib/FlashLFQ/FlashLfqEngine.cs | 75 +++++++++++++++++++++------ mzLib/TestFlashLFQ/TestFlashLFQ.cs | 2 +- 3 files changed, 64 insertions(+), 19 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index a8cc1de95..1d7749392 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -16,7 +16,7 @@ public class ChromatographicPeak public readonly bool IsMbrPeak; public double MbrScore; - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo) + public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool decoyPeak = false) { SplitRT = 0; NumChargeStatesObserved = 0; @@ -27,6 +27,7 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi IsotopicEnvelopes = new List(); IsMbrPeak = isMbrPeak; SpectraFileInfo = fileInfo; + DecoyPeak = decoyPeak; } public IsotopicEnvelope Apex { get; private set; } @@ -47,6 +48,7 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi /// Interquartile range of retention time differences between MBR acceptor file and donor file, used if # calibration points >= 6 /// public double? RtInterquartileRange { get; private set; } + public bool DecoyPeak { get; } public static string TabSeparatedHeader { @@ -75,6 +77,7 @@ public static string TabSeparatedHeader sb.Append("Full Sequences Mapped" + "\t"); sb.Append("Peak Split Valley RT" + "\t"); sb.Append("Peak Apex Mass Error (ppm)"); + sb.Append("\t" + "Decoy Peak"); //sb.Append("Timepoints"); return sb.ToString(); } @@ -249,6 +252,7 @@ public override string ToString() sb.Append("" + NumIdentificationsByFullSeq + "\t"); sb.Append("" + SplitRT + "\t"); sb.Append("" + MassError); + sb.Append("\t" + DecoyPeak); return sb.ToString(); } diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 6d7481de0..533c5664b 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -15,6 +15,7 @@ using System.Threading.Tasks; using UsefulProteomicsDatabases; using System.Runtime.CompilerServices; +using System.IO; [assembly: InternalsVisibleTo("TestFlashLFQ")] @@ -656,7 +657,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // this stores the results of MBR - var matchBetweenRunsIdentifiedPeaks = new Dictionary>>(); + Dictionary>> matchBetweenRunsIdentifiedPeaks = new(); + Random randomGenerator = new Random(); + Dictionary acceptorPeakDecoyPeakDict = new(); // map each donor file onto this file foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) @@ -711,20 +714,35 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (rtInfo == null) continue; - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific, out var bestAcceptor); + + // Draw a random donor that has an rt sufficiently far enough away + var randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; + while(Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < rtInfo.Width*1.25) // multiply for safety, in case the relative rt shifts after alignment + { + randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; + } + // Map the random rt onto the new file + RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + // Find a decoy peak using the randomly drawn retention time + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific, out var bestDecoy, decoyRt:decoyRtInfo.PredictedRt); + acceptorPeakDecoyPeakDict.TryAdd(donorPeak.Identifications.First(), (bestAcceptor, bestDecoy)); } + // Each isotopic envelope is linked to a list of ChromatographicPeaks + // If multiple chromatographic peaks are linked, each with the same peptide identification, then their mbr scores are summed + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications, then they're kept separate. lock (matchBetweenRunsIdentifiedPeaks) { foreach (var kvp in matchBetweenRunsIdentifiedPeaksThreadSpecific) { - if (matchBetweenRunsIdentifiedPeaks.TryGetValue(kvp.Key, out var list)) + if (matchBetweenRunsIdentifiedPeaks.TryGetValue(kvp.Key, out var envelopePeakListDict)) { - foreach (var peak in kvp.Value) + foreach (var envelopePeakListKvp in kvp.Value) { - if (list.TryGetValue(peak.Key, out List existing)) + if (envelopePeakListDict.TryGetValue(envelopePeakListKvp.Key, out List existing)) { - foreach (var acceptorPeak in peak.Value) + foreach (var acceptorPeak in envelopePeakListKvp.Value) { var samePeakSameSequence = existing .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); @@ -742,7 +760,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } else { - list.Add(peak.Key, peak.Value); + envelopePeakListDict.Add(envelopePeakListKvp.Key, envelopePeakListKvp.Value); } } } @@ -755,6 +773,20 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) }); } + var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); + using(StreamWriter writer = new StreamWriter(@"C:\Users\Alex\Desktop\MbrTargetDecoyJan30.tsv")) + { + writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + foreach(var pair in test) + { + if (pair.Value.target != null) + writer.WriteLine(pair.Value.target.ToString()); + else + writer.WriteLine(""); + writer.WriteLine(pair.Value.decoy.ToString()); + } + } + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -914,20 +946,25 @@ internal void FindAllAcceptorPeaks( RtInfo rtInfo, Tolerance fileSpecificTol, ChromatographicPeak donorPeak, - Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific) + Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific, + out ChromatographicPeak bestAcceptor, + double? decoyRt = null) { // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + double rtStartHypothesis = decoyRt == null ? rtInfo.RtStartHypothesis : (double)decoyRt - (rtInfo.Width / 2.0); + double rtEndHypothesis = decoyRt == null ? rtInfo.RtEndHypothesis : (double)decoyRt + (rtInfo.Width / 2.0); + for (int j = 0; j < ms1ScanInfos.Length; j++) { Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= rtInfo.RtStartHypothesis) + if (scan.RetentionTime <= rtStartHypothesis) { start = scan; } - if (scan.RetentionTime >= rtInfo.RtEndHypothesis) + if (scan.RetentionTime >= rtEndHypothesis) { end = scan; break; @@ -943,10 +980,8 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - Normal rtScoringDistribution = new Normal(rtInfo.PredictedRt, rtInfo.Width / 6); - - // Grab the retention time of a random peptide in the donor file - // If it is not outside of the rtInfo.predictedRT +/- rtInfo.range (twice the width of actual window), redraw + Normal rtScoringDistribution = decoyRt == null ? new Normal(rtInfo.PredictedRt, rtInfo.Width / 6) : new Normal((double)decoyRt, rtInfo.Width / 6); + bestAcceptor = null; foreach (int z in chargesToMatch) { @@ -967,9 +1002,14 @@ internal void FindAllAcceptorPeaks( // remove the clustered isotopic envelopes from the list of seeds after each iteration while (chargeEnvelopes.Any()) { - ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, scorer, donorPeak, fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes); + ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, scorer, donorPeak, + fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes, isDecoy: decoyRt != null); if (acceptorPeak == null) continue; + if (bestAcceptor == null || bestAcceptor.MbrScore < acceptorPeak.MbrScore) + bestAcceptor = acceptorPeak; + if (decoyRt != null) + continue; // We don't want to store the decoys in mbrIdentifiedPeaks right now // save the peak hypothesis // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak @@ -1023,10 +1063,11 @@ internal ChromatographicPeak FindAcceptorPeak( RtInfo rtInfo, Normal rtScoringDistribution, int z, - List chargeEnvelopes) + List chargeEnvelopes, + bool isDecoy = false) { var donorId = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, isDecoy); IsotopicEnvelope seedEnv = chargeEnvelopes.First(); var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorId.PeakfindingMass, z, idAcceptorFile, mbrTol); diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 787b2800b..8994b6996 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -1356,7 +1356,7 @@ public static void RealDataMbrTest() } double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.That(corr > 0.8); + Assert.Greater(corr, 0.8); peptideIntensities.Clear(); foreach (var peptide in f1r2MbrResults) From add08ab35c5d26ec66e253048ab8417aa749d92f Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 31 Jan 2024 14:05:10 -0600 Subject: [PATCH 09/55] refactore MBR to use concurrent dictionaries --- mzLib/FlashLFQ/FlashLfqEngine.cs | 185 +++++++++++++++++-------------- mzLib/FlashLFQ/MbrScorer.cs | 12 +- 2 files changed, 112 insertions(+), 85 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 533c5664b..afe4659e3 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -657,7 +657,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // this stores the results of MBR - Dictionary>> matchBetweenRunsIdentifiedPeaks = new(); + ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); Random randomGenerator = new Random(); Dictionary acceptorPeakDecoyPeakDict = new(); @@ -714,78 +714,72 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (rtInfo == null) continue; - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific, out var bestAcceptor); + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestAcceptor); // Draw a random donor that has an rt sufficiently far enough away var randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; - while(Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < rtInfo.Width*1.25) // multiply for safety, in case the relative rt shifts after alignment + int randomPeaksSampled = 0; + while(randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence + || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < rtInfo.Width*1.25) // multiply for safety, in case the relative rt shifts after alignment { randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; + if (randomPeaksSampled++ > (idDonorPeaks.Count - 1)) + { + randomDonor = null; + break; // Prevent infinite loops + } } + if (randomDonor == null) continue; + // Map the random rt onto the new file RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); // Find a decoy peak using the randomly drawn retention time - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific, out var bestDecoy, decoyRt:decoyRtInfo.PredictedRt); + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, decoyRt:decoyRtInfo.PredictedRt); acceptorPeakDecoyPeakDict.TryAdd(donorPeak.Identifications.First(), (bestAcceptor, bestDecoy)); } - - // Each isotopic envelope is linked to a list of ChromatographicPeaks - // If multiple chromatographic peaks are linked, each with the same peptide identification, then their mbr scores are summed - // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications, then they're kept separate. - lock (matchBetweenRunsIdentifiedPeaks) - { - foreach (var kvp in matchBetweenRunsIdentifiedPeaksThreadSpecific) - { - if (matchBetweenRunsIdentifiedPeaks.TryGetValue(kvp.Key, out var envelopePeakListDict)) - { - foreach (var envelopePeakListKvp in kvp.Value) - { - if (envelopePeakListDict.TryGetValue(envelopePeakListKvp.Key, out List existing)) - { - foreach (var acceptorPeak in envelopePeakListKvp.Value) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; - samePeakSameSequence.Identifications.Add(acceptorPeak.Identifications.First()); - } - else - { - existing.Add(acceptorPeak); - } - } - } - else - { - envelopePeakListDict.Add(envelopePeakListKvp.Key, envelopePeakListKvp.Value); - } - } - } - else - { - matchBetweenRunsIdentifiedPeaks.Add(kvp.Key, kvp.Value); - } - } - } }); } - var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); - using(StreamWriter writer = new StreamWriter(@"C:\Users\Alex\Desktop\MbrTargetDecoyJan30.tsv")) + // If we have multiple identification with the same sequence mapped to the same peak, we want to sum their MBR scores + // This is done here + foreach(var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) { - writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - foreach(var pair in test) + // Each isotopic envelope is linked to a list of ChromatographicPeaks + // If multiple chromatographic peaks are linked, each with the same peptide identification, then their mbr scores are summed + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. + foreach (var envelopeListKvp in seqDictionaryKvp.Value) { - if (pair.Value.target != null) - writer.WriteLine(pair.Value.target.ToString()); - else - writer.WriteLine(""); - writer.WriteLine(pair.Value.decoy.ToString()); + List collapsedPeaks = new(); + foreach(var peakGroup in envelopeListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + { + var scoreSum = peakGroup.Sum(peak => peak.MbrScore); + var idList = peakGroup.Select(peak => peak.Identifications.First()).Distinct().ToList(); // This is fine, because each mbrPeak only has one identification + var collapsedPeak = peakGroup.First(); + collapsedPeak.MbrScore = scoreSum; + // Lmao, these three lines are problematic. Should probably do something about them + collapsedPeak.Identifications.Clear(); + collapsedPeak.Identifications.AddRange(idList); + collapsedPeak.ResolveIdentifications(); + collapsedPeaks.Add(collapsedPeak); + } + envelopeListKvp.Value.Clear(); + envelopeListKvp.Value.AddRange(collapsedPeaks); } } + + //var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); + //using(StreamWriter writer = new StreamWriter(@"C:\Users\Alex\Desktop\MbrTargetDecoyJan30.tsv")) + //{ + // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + // foreach(var pair in test) + // { + // if (pair.Value.target != null) + // writer.WriteLine(pair.Value.target.ToString()); + // else + // writer.WriteLine(""); + // writer.WriteLine(pair.Value.decoy.ToString()); + // } + //} // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) @@ -946,7 +940,7 @@ internal void FindAllAcceptorPeaks( RtInfo rtInfo, Tolerance fileSpecificTol, ChromatographicPeak donorPeak, - Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific, + ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks, out ChromatographicPeak bestAcceptor, double? decoyRt = null) { @@ -1012,34 +1006,63 @@ internal void FindAllAcceptorPeaks( continue; // We don't want to store the decoys in mbrIdentifiedPeaks right now // save the peak hypothesis - // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak - if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) - { - if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; - samePeakSameSequence.Identifications.Add(donorIdentification); - } - else + matchBetweenRunsIdentifiedPeaks.AddOrUpdate + ( + // new key + key: donorIdentification.ModifiedSequence, + // if we are adding a value for the first time, we simply create a new dictionatry with one entry + addValueFactory: (sequenceKey) => + new ConcurrentDictionary>( + new Dictionary> { - existing.Add(acceptorPeak); - } - } - else + { acceptorPeak.Apex, new List { acceptorPeak } } + }), + // if the key (sequence) already exists, we have to add the new peak to the existing dictionary + updateValueFactory: (sequenceKey, envelopePeakListDict) => { - mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); + envelopePeakListDict.AddOrUpdate( + key: acceptorPeak.Apex, + addValueFactory: (envelopeKey) => new List { acceptorPeak }, // if the key (envelope) doesnt exist, just create a new list + updateValueFactory: (envelopeKey, peakList) => { peakList.Add(acceptorPeak); return peakList; }); // if the key (envelope) already exists, add the peak to the associated list + return envelopePeakListDict; } - } - else - { - matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); - matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); - } + ); + + // save the peak hypothesis + // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak + //if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var envelopePeaksKvp)) + //{ + // envelopePeaksKvp.AddOrUpdate(acceptorPeak.Apex, new List { acceptorPeak }, // just add the new key value pair if it doesnt already exist + // (key, peakList) => { peakList.Add(acceptorPeak); return peakList; }); // if key already exists, we simply add the acceptorPeak to the list + //if (envelopePeaksKvp.TryGetValue(acceptorPeak.Apex, out List existing)) + //{ + // var samePeakSameSequence = existing + // .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); + + // if (samePeakSameSequence != null) + // { + // samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; + // samePeakSameSequence.Identifications.Add(donorIdentification); + // } + // else + // { + // existing.Add(acceptorPeak); + // } + //} + //else + //{ + // envelopePeaksKvp.AddOrUpdate(acceptorPeak.Apex, new List { acceptorPeak }, // just add the new key value pair if it doesnt already exist + // (key, peakList) => { peakList.Add(acceptorPeak); return peakList; }); // if key already exists, we simply add the acceptorPeak to the list + //} + //} + //else + //{ + // if(!matchBetweenRunsIdentifiedPeaksThreadSpecific.TryAdd(donorIdentification.ModifiedSequence, new ConcurrentDictionary>())) + // { + + // } + // matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].TryAdd(acceptorPeak.Apex, new List { acceptorPeak }); + //} } } } diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 05f03da1a..f62b055dc 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -54,12 +54,16 @@ internal double ScoreMbr(Normal rtDistribution, double retentionTime, double ppm else { var logIntensity = Math.Log(acceptorIntensity, 2); + // I don't know what the if/else statement accomplishes. It feels like we should take the density regardless // As it is, the score is artifically inflated for very intense peaks - if (logIntensity < _logIntensityDistribution.Median) - intensityDensity = _logIntensityDistribution.Density(logIntensity); - else - intensityDensity = _logIntensityDistribution.Density(_logIntensityDistribution.Mode); + //if (logIntensity < _logIntensityDistribution.Median) + // intensityDensity = _logIntensityDistribution.Density(logIntensity); + //else + // intensityDensity = _logIntensityDistribution.Density(_logIntensityDistribution.Mode); + + //alternate, more straightforward approach + intensityDensity = _logIntensityDistribution.Density(logIntensity); } double intensityScore = DensityScoreConversion(intensityDensity); From 1da0e133f2099fe20651b45d97b5d7f981b95a6c Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 10:40:16 -0600 Subject: [PATCH 10/55] Test method with output --- mzLib/FlashLFQ/ChromatographicPeak.cs | 14 +-- mzLib/FlashLFQ/FlashLfqEngine.cs | 42 +++++---- mzLib/FlashLFQ/Identification.cs | 5 +- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 109 +++++++++++++++++++++++ mzLib/TestFlashLFQ/TestFlashLFQ.cs | 4 +- 5 files changed, 150 insertions(+), 24 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 1d7749392..1efb53e37 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -16,7 +16,7 @@ public class ChromatographicPeak public readonly bool IsMbrPeak; public double MbrScore; - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool decoyPeak = false) + public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) { SplitRT = 0; NumChargeStatesObserved = 0; @@ -27,7 +27,8 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi IsotopicEnvelopes = new List(); IsMbrPeak = isMbrPeak; SpectraFileInfo = fileInfo; - DecoyPeak = decoyPeak; + RandomRt = randomRt; + } public IsotopicEnvelope Apex { get; private set; } @@ -48,7 +49,8 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi /// Interquartile range of retention time differences between MBR acceptor file and donor file, used if # calibration points >= 6 /// public double? RtInterquartileRange { get; private set; } - public bool DecoyPeak { get; } + public bool RandomRt { get; } + public bool DecoyPeptide { get; } public static string TabSeparatedHeader { @@ -77,7 +79,8 @@ public static string TabSeparatedHeader sb.Append("Full Sequences Mapped" + "\t"); sb.Append("Peak Split Valley RT" + "\t"); sb.Append("Peak Apex Mass Error (ppm)"); - sb.Append("\t" + "Decoy Peak"); + sb.Append("\t" + "Decoy Peptide"); + sb.Append("\t" + "Random Rt"); //sb.Append("Timepoints"); return sb.ToString(); } @@ -252,7 +255,8 @@ public override string ToString() sb.Append("" + NumIdentificationsByFullSeq + "\t"); sb.Append("" + SplitRT + "\t"); sb.Append("" + MassError); - sb.Append("\t" + DecoyPeak); + sb.Append("\t" + Identifications.First().IsDecoy); + sb.Append("\t" + RandomRt); return sb.ToString(); } diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index afe4659e3..3ca25d11e 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -659,7 +659,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // this stores the results of MBR ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); Random randomGenerator = new Random(); - Dictionary acceptorPeakDecoyPeakDict = new(); + ConcurrentDictionary acceptorPeakDecoyPeakDict = new(); // map each donor file onto this file foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) @@ -705,7 +705,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) (range, loopState) => { var nearbyCalibrationPoints = new List(); - var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); + //var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); for (int i = range.Item1; i < range.Item2; i++) { @@ -767,20 +767,20 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } - //var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); - //using(StreamWriter writer = new StreamWriter(@"C:\Users\Alex\Desktop\MbrTargetDecoyJan30.tsv")) - //{ - // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - // foreach(var pair in test) - // { - // if (pair.Value.target != null) - // writer.WriteLine(pair.Value.target.ToString()); - // else - // writer.WriteLine(""); - // writer.WriteLine(pair.Value.decoy.ToString()); - // } - //} - + var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrTargetDecoyFirst.tsv")) + { + writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + foreach (var pair in test) + { + if (pair.Value.target != null) + writer.WriteLine(pair.Value.target.ToString()); + else + writer.WriteLine(""); + writer.WriteLine(pair.Value.decoy.ToString()); + } + } + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -814,6 +814,16 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } _results.Peaks[idAcceptorFile].Add(best); + + } + + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrAllMbrPeaks.tsv")) + { + writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + foreach (var peak in _results.Peaks[idAcceptorFile].Where(peak => peak.IsMbrPeak)) + { + writer.WriteLine(peak.ToString()); + } } RunErrorChecking(idAcceptorFile); diff --git a/mzLib/FlashLFQ/Identification.cs b/mzLib/FlashLFQ/Identification.cs index 59f43a1fc..6395000bf 100644 --- a/mzLib/FlashLFQ/Identification.cs +++ b/mzLib/FlashLFQ/Identification.cs @@ -16,11 +16,13 @@ public class Identification public readonly bool UseForProteinQuant; public double PeakfindingMass; public double PosteriorErrorProbability; + public bool IsDecoy { get; } public Identification(SpectraFileInfo fileInfo, string BaseSequence, string ModifiedSequence, double monoisotopicMass, double ms2RetentionTimeInMinutes, int chargeState, List proteinGroups, - ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true, double posteriorErrorProbability = 0) + ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true, double posteriorErrorProbability = 0, + bool decoy = false) { this.FileInfo = fileInfo; this.BaseSequence = BaseSequence; @@ -32,6 +34,7 @@ public Identification(SpectraFileInfo fileInfo, string BaseSequence, string Modi this.OptionalChemicalFormula = optionalChemicalFormula; UseForProteinQuant = useForProteinQuant; PosteriorErrorProbability = posteriorErrorProbability; + IsDecoy = decoy; } public override string ToString() diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index b8ce7f152..cf126e48c 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -40,6 +40,115 @@ public static int TestDecoySearchFlipFlop(int searchCount) return result; } + [Test] + // This is gonna have a bunch of local file references, just a heads up. Dont make github try and build this one + public static void TwoFileMbrTest() + { + string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\subset_psms.psmtsv"; + + SpectraFileInfo j5 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J5.raw", "a", 0, 0, 0); + SpectraFileInfo j6 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J6.raw", "a", 1, 0, 0); + + List ids = new List(); + Dictionary allProteinGroups = new Dictionary(); + foreach (string line in File.ReadAllLines(psmFile)) + { + var split = line.Split(new char[] { '\t' }); + + if (split.Contains("File Name") || string.IsNullOrWhiteSpace(line)) + { + continue; + } + + SpectraFileInfo file = null; + + if (split[0].Contains("J5")) + { + file = j5; + } + else if (split[0].Contains("J6")) + { + file = j6; + } + + string baseSequence = split[12]; + string fullSequence = split[13]; + double monoMass = double.Parse(split[22]); + double rt = double.Parse(split[2]); + int z = (int)double.Parse(split[6]); + var proteins = split[25].Split(new char[] { '|' }); + List proteinGroups = new List(); + foreach (var protein in proteins) + { + if (allProteinGroups.TryGetValue(protein, out var proteinGroup)) + { + proteinGroups.Add(proteinGroup); + } + else + { + allProteinGroups.Add(protein, new ProteinGroup(protein, "", "")); + proteinGroups.Add(allProteinGroups[protein]); + } + } + + bool isDecoy = split[32] == "Y"; + + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy); + ids.Add(id); + } + + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); + var results = engine.Run(); + + var f1r1MbrResults = results + .PeptideModifiedSequences + .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); + + Assert.That(f1r1MbrResults.Count >= 132); + + var f1r2MbrResults = results.PeptideModifiedSequences + .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MSMS && p.Value.GetDetectionType(j6) == DetectionType.MBR).ToList(); + + Assert.That(f1r2MbrResults.Count >= 77); + + List<(double, double)> peptideIntensities = new List<(double, double)>(); + + foreach (var peptide in f1r1MbrResults) + { + double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j5)); + double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j6)); + peptideIntensities.Add((mbrIntensity, msmsIntensity)); + } + + double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); + Assert.Greater(corr, 0.8); + + peptideIntensities.Clear(); + foreach (var peptide in f1r2MbrResults) + { + double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j6)); + double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j5)); + peptideIntensities.Add((mbrIntensity, msmsIntensity)); + } + + corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); + + Assert.That(corr > 0.7); + + // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein + // has to be observed in a condition for match-between-runs + j5.Condition = "b"; + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); + results = engine.Run(); + var proteinsObservedInF1 = ids.Where(p => p.FileInfo == j5).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF2 = ids.Where(p => p.FileInfo == j6).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF1ButNotF2 = proteinsObservedInF1.Except(proteinsObservedInF2).ToList(); + foreach (ProteinGroup protein in proteinsObservedInF1ButNotF2) + { + Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(j6) == 0); + } + } + [Test] public static void DecoyPeakFindTrial() { diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 8994b6996..96fcec2a6 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -1332,7 +1332,7 @@ public static void RealDataMbrTest() ids.Add(id); } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 1); + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); var results = engine.Run(); var f1r1MbrResults = results @@ -1373,7 +1373,7 @@ public static void RealDataMbrTest() // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein // has to be observed in a condition for match-between-runs f1r1.Condition = "b"; - engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1); + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); results = engine.Run(); var proteinsObservedInF1 = ids.Where(p => p.FileInfo == f1r1).SelectMany(p => p.ProteinGroups).Distinct().ToList(); var proteinsObservedInF2 = ids.Where(p => p.FileInfo == f1r2).SelectMany(p => p.ProteinGroups).Distinct().ToList(); From 03efb21dd544bbd66d8768826b161a469bcf905a Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 10:44:45 -0600 Subject: [PATCH 11/55] Deleted unused fields and references to PearsonCorrelation in isotopicEnvelope class and CheckEnvelope method --- mzLib/FlashLFQ/FlashLfqEngine.cs | 7 +++---- mzLib/FlashLFQ/IsotopicEnvelope.cs | 9 --------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 110f358a6..e1125ffcd 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -1186,7 +1186,7 @@ public List GetIsotopicEnvelopes( } } - isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum(), corr)); + isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum())); } } @@ -1207,10 +1207,9 @@ public bool CheckIsotopicEnvelopeCorrelation( Dictionary> massShiftToIsotopePeaks, IndexedMassSpectralPeak peak, int chargeState, - Tolerance isotopeTolerance, - out double pearsonCorrelation) + Tolerance isotopeTolerance) { - pearsonCorrelation = Correlation.Pearson( + double pearsonCorrelation = Correlation.Pearson( massShiftToIsotopePeaks[0].Select(p => p.expIntensity), massShiftToIsotopePeaks[0].Select(p => p.theorIntensity)); diff --git a/mzLib/FlashLFQ/IsotopicEnvelope.cs b/mzLib/FlashLFQ/IsotopicEnvelope.cs index 0875f340b..09d7207d7 100644 --- a/mzLib/FlashLFQ/IsotopicEnvelope.cs +++ b/mzLib/FlashLFQ/IsotopicEnvelope.cs @@ -10,7 +10,6 @@ public class IsotopicEnvelope /// public readonly IndexedMassSpectralPeak IndexedPeak; public readonly int ChargeState; - public readonly double PearsonCorrelation; public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity) { @@ -19,14 +18,6 @@ public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeStat Intensity = intensity / chargeState; } - public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity, double pearsonCorrelation) - { - IndexedPeak = monoisotopicPeak; - ChargeState = chargeState; - Intensity = intensity / chargeState; - PearsonCorrelation = pearsonCorrelation; - } - /// /// The summed intensity of all isotope peaks detected in one MS1 scan. This sum may contain /// imputed intensity values for expected isotopes that weren't observed, but only if the observed From 5eb2637cec31edc46ac5bb6ea364c9a4ba77da37 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 10:46:20 -0600 Subject: [PATCH 12/55] Deleted unused using statements in FlashLFQ engine --- mzLib/FlashLFQ/FlashLfqEngine.cs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index e1125ffcd..a6812cfaa 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -1,17 +1,13 @@ using Chemistry; -using MassSpectrometry; using MathNet.Numerics.Distributions; -using MathNet.Numerics.LinearAlgebra.Factorization; using MathNet.Numerics.Statistics; using MzLibUtil; using Proteomics.AminoAcidPolymer; using System; -using System.Collections; using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Linq; -using System.Runtime; using System.Threading.Tasks; using UsefulProteomicsDatabases; using System.Runtime.CompilerServices; @@ -1174,7 +1170,7 @@ public List GetIsotopicEnvelopes( } // Check that the experimental envelope matches the theoretical - if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance, out var corr)) + if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance)) { // impute unobserved isotope peak intensities // TODO: Figure out why value imputation is performed. Build a toggle? From 1aeeb962ebd57f418d95ee2c47b1cb0c0427e031 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 10:53:18 -0600 Subject: [PATCH 13/55] Commented + removed unused variables for PredicteRetentionTime method --- mzLib/FlashLFQ/FlashLfqEngine.cs | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index a6812cfaa..3478539a4 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -471,23 +471,29 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) _results.Peaks[fileInfo].AddRange(chromatographicPeaks.ToList()); } + /// + /// Used by MBR. Predicts the retention time of a peak in an acceptor file based on the + /// retention time of the peak in the donor file. This is done with a local alignment + /// where all peaks within 30 seconds of the donor peak are matched to peaks with the same associated peptide in the acceptor file, + /// if such a peak exists. + /// + /// Array of all shared peaks between the donor and the acceptor file + /// RtInfo object containing the predicted retention time of the acceptor peak and the width of the predicted retention time window internal RtInfo PredictRetentionTime( RetentionTimeCalibDataPoint[] rtCalibrationCurve, ChromatographicPeak donorPeak, - SpectraFileInfo acceptorFile, SpectraFileInfo donorFile, - bool acceptorSampleIsFractionated, bool donorSampleIsFractionated) + SpectraFileInfo acceptorFile, + bool acceptorSampleIsFractionated, + bool donorSampleIsFractionated) { var nearbyCalibrationPoints = new List(); - var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); - - nearbyCalibrationPoints.Clear(); // only compare +- 1 fraction if (acceptorSampleIsFractionated && donorSampleIsFractionated) { int acceptorFractionNumber = acceptorFile.Fraction; - int donorFractionNumber = donorFile.Fraction; + int donorFractionNumber = donorPeak.SpectraFileInfo.Fraction; if (Math.Abs(acceptorFractionNumber - donorFractionNumber) > 1) { @@ -495,8 +501,6 @@ internal RtInfo PredictRetentionTime( } } - Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - // binary search for this donor peak in the retention time calibration spline RetentionTimeCalibDataPoint testPoint = new RetentionTimeCalibDataPoint(donorPeak, null); int index = Array.BinarySearch(rtCalibrationCurve, testPoint); @@ -704,7 +708,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { ChromatographicPeak donorPeak = idDonorPeaks[i]; // TODO: Add a toggle that set rtRange to be maximum width - RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (rtInfo == null) continue; FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); From 112ae41715a758af77d5a2ebfa151acbaa67d69f Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 11:01:11 -0600 Subject: [PATCH 14/55] Added comments, removed unused using statements --- mzLib/FlashLFQ/FlashLfqEngine.cs | 23 +++++++++++++++++++---- mzLib/FlashLFQ/MbrScorer.cs | 6 ++++-- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 3478539a4..e26edb284 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -577,9 +577,15 @@ internal RtInfo PredictRetentionTime( return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); } + /// + /// Constructs a MbrScorer object that is used to score all MBR peaks for a given acceptor file + /// + /// All MSMS identified peaks in the acceptor file + /// A ppm tolerance specific to the given file + /// A MbrScorer object private MbrScorer BuildMbrScorer(List acceptorFileIdentifiedPeaks, out Tolerance fileSpecificMbrTolerance) { - // Ppm distribution + // Construct a distribution of ppm errors for all MSMS peaks in the acceptor file var apexToAcceptorFilePeakDict = new Dictionary(); List ppmErrors = new List(); foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null)) @@ -601,15 +607,15 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie double fileSpecificMbrPpmTolerance = Math.Min(Math.Abs(ppmErrors.Median()) + ppmSpread * 4, MbrPpmTolerance); fileSpecificMbrTolerance = new PpmTolerance(fileSpecificMbrPpmTolerance); // match between runs PPM tolerance - // Intensity Distribution + // Construct a distribution of peak log intensities for all MSMS peaks in the acceptor file var acceptorFileLogIntensities = acceptorFileIdentifiedPeaks .Where(p => p.Intensity > 0) .Select(p => Math.Log(p.Intensity, 2)) .ToList(); double medianAcceptorLogIntensity = acceptorFileLogIntensities.Median(); - Normal intensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); + Normal logIntensityDistribution = new Normal(acceptorFileLogIntensities.Median(), acceptorFileLogIntensities.InterquartileRange() / 1.36); - return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, intensityDistribution); + return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, logIntensityDistribution); } /// @@ -793,6 +799,15 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) RunErrorChecking(idAcceptorFile); } + /// + /// Finds MBR acceptor peaks by looping through every possible peak for every possible charge state + /// in a given retention time range. Identified peaks are added to the matchBetweenRunsIdentifiedPeaks dictionary. + /// + /// The MbrScorer object used to score acceptor peaks + /// RtInfo object containing the predicted retention time for the acceptor peak and the width of the expected RT window + /// Ppm Tolerance specific to the acceptor file + /// The donor peak. Acceptor peaks are presumed to represent the same peptide ast he donor peak + /// A dictionary containing peptide sequences and their associated mbr peaks internal void FindAllAcceptorPeaks( SpectraFileInfo idAcceptorFile, MbrScorer scorer, diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 05f03da1a..924611e64 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -3,8 +3,6 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Text; -using System.Threading.Tasks; namespace FlashLFQ { @@ -41,6 +39,10 @@ internal MbrScorer( _logFcDistributionDictionary = new(); } + /// + /// Scores a MBR peak based on it's retention time, ppm error, and intensity + /// + /// The MBR score as a double. Higher scores are better. internal double ScoreMbr(Normal rtDistribution, double retentionTime, double ppmError, double acceptorIntensity, ChromatographicPeak? donorPeak = null) { double intensityDensity; From e3fe1e21cbbd377b2051c6b7d6422df0e29fa6b8 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 11:02:17 -0600 Subject: [PATCH 15/55] Deleted unused variables --- mzLib/FlashLFQ/FlashLfqEngine.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index e26edb284..0efff5f32 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -707,7 +707,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (range, loopState) => { - var nearbyCalibrationPoints = new List(); var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); for (int i = range.Item1; i < range.Item2; i++) From 40d5f45e86499ec79a7493b496fcb4dbff7e7240 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 14:14:56 -0600 Subject: [PATCH 16/55] Tests are still passing, but I'm about to break things --- mzLib/FlashLFQ/ChromatographicPeak.cs | 5 +- mzLib/FlashLFQ/FlashLfqEngine.cs | 119 +++++------- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 224 ++--------------------- 3 files changed, 69 insertions(+), 279 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 1efb53e37..503e4f59d 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -2,6 +2,7 @@ using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; +using System.Configuration; using System.Linq; using System.Text; @@ -50,7 +51,7 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi /// public double? RtInterquartileRange { get; private set; } public bool RandomRt { get; } - public bool DecoyPeptide { get; } + public bool DecoyPeptide => Identifications.First().IsDecoy; public static string TabSeparatedHeader { @@ -255,7 +256,7 @@ public override string ToString() sb.Append("" + NumIdentificationsByFullSeq + "\t"); sb.Append("" + SplitRT + "\t"); sb.Append("" + MassError); - sb.Append("\t" + Identifications.First().IsDecoy); + sb.Append("\t" + DecoyPeptide); sb.Append("\t" + RandomRt); return sb.ToString(); diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 29b64c8b0..59ba55b71 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -736,14 +736,16 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (randomDonor == null) continue; // Map the random rt onto the new file - RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idDonorFile, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor,idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + if (decoyRtInfo == null) continue; // Find a decoy peak using the randomly drawn retention time - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, decoyRt:decoyRtInfo.PredictedRt); + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, randomRt:decoyRtInfo.PredictedRt); acceptorPeakDecoyPeakDict.TryAdd(donorPeak.Identifications.First(), (bestAcceptor, bestDecoy)); } }); } + // If we have multiple identification with the same sequence mapped to the same peak, we want to sum their MBR scores // This is done here foreach(var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) @@ -771,19 +773,19 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } - var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrTargetDecoyFirst.tsv")) - { - writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - foreach (var pair in test) - { - if (pair.Value.target != null) - writer.WriteLine(pair.Value.target.ToString()); - else - writer.WriteLine(""); - writer.WriteLine(pair.Value.decoy.ToString()); - } - } + //var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); + //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrTargetDecoyFirst.tsv")) + //{ + // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + // foreach (var pair in test) + // { + // if (pair.Value.target != null) + // writer.WriteLine(pair.Value.target.ToString()); + // else + // writer.WriteLine(""); + // writer.WriteLine(pair.Value.decoy.ToString()); + // } + //} // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) @@ -797,6 +799,10 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); ChromatographicPeak best = peakHypotheses.First(); + if(best.DecoyPeptide) + { + int placeholder = 0; + } peakHypotheses.Remove(best); @@ -808,6 +814,10 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) List peaksToRemoveFromHypotheses = new List(); foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) { + if (peak.DecoyPeptide) + { + int placeholder = 0; + } if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) { best.MergeFeatureWith(peak, Integrate); @@ -821,14 +831,14 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrAllMbrPeaks.tsv")) - { - writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - foreach (var peak in _results.Peaks[idAcceptorFile].Where(peak => peak.IsMbrPeak)) - { - writer.WriteLine(peak.ToString()); - } - } + //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrAllMbrPeaks.tsv")) + //{ + // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + // foreach (var peak in _results.Peaks[idAcceptorFile].Where(peak => peak.IsMbrPeak)) + // { + // writer.WriteLine(peak.ToString()); + // } + //} RunErrorChecking(idAcceptorFile); } @@ -850,14 +860,14 @@ internal void FindAllAcceptorPeaks( ChromatographicPeak donorPeak, ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks, out ChromatographicPeak bestAcceptor, - double? decoyRt = null) + double? randomRt = null) { // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; - double rtStartHypothesis = decoyRt == null ? rtInfo.RtStartHypothesis : (double)decoyRt - (rtInfo.Width / 2.0); - double rtEndHypothesis = decoyRt == null ? rtInfo.RtEndHypothesis : (double)decoyRt + (rtInfo.Width / 2.0); + double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - (rtInfo.Width / 2.0); + double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + (rtInfo.Width / 2.0); for (int j = 0; j < ms1ScanInfos.Length; j++) { @@ -882,7 +892,7 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - Normal rtScoringDistribution = decoyRt == null ? new Normal(rtInfo.PredictedRt, rtInfo.Width / 6) : new Normal((double)decoyRt, rtInfo.Width / 6); + Normal rtScoringDistribution = randomRt == null ? new Normal(rtInfo.PredictedRt, rtInfo.Width / 6) : new Normal((double)randomRt, rtInfo.Width / 6); bestAcceptor = null; foreach (int z in chargesToMatch) @@ -904,14 +914,14 @@ internal void FindAllAcceptorPeaks( // remove the clustered isotopic envelopes from the list of seeds after each iteration while (chargeEnvelopes.Any()) { - ChromatographicPeak acceptorPeak = FindAcceptorPeak(idAcceptorFile, scorer, donorPeak, - fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes, isDecoy: decoyRt != null); + ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, + fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes, randomRt: randomRt != null); if (acceptorPeak == null) continue; if (bestAcceptor == null || bestAcceptor.MbrScore < acceptorPeak.MbrScore) bestAcceptor = acceptorPeak; - if (decoyRt != null) - continue; // We don't want to store the decoys in mbrIdentifiedPeaks right now + //if (decoyRt != null) + // continue; // We don't want to store the decoys in mbrIdentifiedPeaks right now // save the peak hypothesis matchBetweenRunsIdentifiedPeaks.AddOrUpdate @@ -935,42 +945,6 @@ internal void FindAllAcceptorPeaks( return envelopePeakListDict; } ); - - // save the peak hypothesis - // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak - //if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var envelopePeaksKvp)) - //{ - // envelopePeaksKvp.AddOrUpdate(acceptorPeak.Apex, new List { acceptorPeak }, // just add the new key value pair if it doesnt already exist - // (key, peakList) => { peakList.Add(acceptorPeak); return peakList; }); // if key already exists, we simply add the acceptorPeak to the list - //if (envelopePeaksKvp.TryGetValue(acceptorPeak.Apex, out List existing)) - //{ - // var samePeakSameSequence = existing - // .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - // if (samePeakSameSequence != null) - // { - // samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; - // samePeakSameSequence.Identifications.Add(donorIdentification); - // } - // else - // { - // existing.Add(acceptorPeak); - // } - //} - //else - //{ - // envelopePeaksKvp.AddOrUpdate(acceptorPeak.Apex, new List { acceptorPeak }, // just add the new key value pair if it doesnt already exist - // (key, peakList) => { peakList.Add(acceptorPeak); return peakList; }); // if key already exists, we simply add the acceptorPeak to the list - //} - //} - //else - //{ - // if(!matchBetweenRunsIdentifiedPeaksThreadSpecific.TryAdd(donorIdentification.ModifiedSequence, new ConcurrentDictionary>())) - // { - - // } - // matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].TryAdd(acceptorPeak.Apex, new List { acceptorPeak }); - //} } } } @@ -995,10 +969,10 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( Normal rtScoringDistribution, int z, List chargeEnvelopes, - bool isDecoy = false) + bool randomRt = false) { var donorId = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, isDecoy); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt); IsotopicEnvelope seedEnv = chargeEnvelopes.First(); var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorId.PeakfindingMass, z, idAcceptorFile, mbrTol); @@ -1009,9 +983,10 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( CutPeak(acceptorPeak, seedEnv.IndexedPeak.RetentionTime); - var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)); - claimedPeaks.Add(seedEnv.IndexedPeak); // prevents infinite loops - + var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)) + { + seedEnv.IndexedPeak // prevents infinite loops + }; chargeEnvelopes.RemoveAll(p => claimedPeaks.Contains(p.IndexedPeak)); // peak has already been identified by MSMS - skip it diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index cf126e48c..fe902088e 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -100,6 +100,25 @@ public static void TwoFileMbrTest() var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); var results = engine.Run(); + var test = results.Peaks.Values.SelectMany(peakList => peakList).ToList(); + int place = 0; + + List mbrPeaks = new(); + + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.RandomRt).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); + + + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllMbrPeaks.tsv")) + { + writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + foreach (var peak in mbrPeaks) + { + writer.WriteLine(peak); + } + } + var f1r1MbrResults = results .PeptideModifiedSequences .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); @@ -149,210 +168,5 @@ public static void TwoFileMbrTest() } } - [Test] - public static void DecoyPeakFindTrial() - { - string decoyPeptidePath = @"C:\Users\Alex\Source\Repos\chronologer\chronologer-main(noChange)\Arabidopsis_Tryptic_Peptides_Slice.tsv"; - string spectraFilePath = @"D:\SingleCellDataSets\Organoid\Calibration_MM_320\Task1-CalibrateTask\HFL1SC_Unhealthy_CH2_J5-calib.mzML"; - ProteinGroup pg = new ProteinGroup("xyz", "x", "z"); - List pgs = new List { pg }; - SpectraFileInfo j5 = new SpectraFileInfo(spectraFilePath, "A", 1, 1, 1); - double rtRange = 4.0; - - List decoys = new(); - Loaders.LoadElements(); - - using (StreamReader reader = new StreamReader(decoyPeptidePath)) - { - reader.ReadLine(); - while(!reader.EndOfStream) - { - string[] lineSplit = reader.ReadLine().Split('\t'); - if (double.Parse(lineSplit[4]) > 57.5) continue; - - Peptide peptide = new Peptide(sequence: lineSplit[0]); - Identification decoyId = new Identification(j5, peptide.BaseSequence, peptide.BaseSequence, - peptide.MonoisotopicMass, ms2RetentionTimeInMinutes: double.Parse(lineSplit[4]), - chargeState: peptide.MonoisotopicMass > 1600 ? 3 : 2, pgs); - decoys.Add(decoyId); - } - } - - - FlashLfqEngine engine = new FlashLfqEngine( - decoys - ); - - engine.CalculateTheoreticalIsotopeDistributions(); - engine._ms1Scans = new Dictionary(); - engine._peakIndexingEngine.IndexMassSpectralPeaks(j5, true, engine._ms1Scans); - - - List decoyPeaks = new(); - List massDifferences = new(); - Dictionary apexPeakDict = new(); - int decoysConsidered = 0; - Random rnd = new Random(); - foreach (Identification decoy in decoys) - { - - int rndInt = rnd.Next(1, 13); - // Eliminate ~ half of the decoys with mass greater than 2000 daltons - // This is an ad-hoc way of matching target and decoy mass distribution - if (decoy.PeakfindingMass > 1800 && rndInt % 2 == 0) continue; - else if (decoy.PeakfindingMass > 1400 && rndInt % 3 == 0) continue; - - if (decoy.Ms2RetentionTimeInMinutes < 8 && rndInt < 11) continue; - - PpmTolerance tolerance = new PpmTolerance(10); - var foundPeak = engine.FindDecoyPeak( - j5, - apexPeakDict, - tolerance, - (decoy.Ms2RetentionTimeInMinutes, rtRange, null, null), - decoy); - - if (foundPeak != null) - { - decoyPeaks.Add(foundPeak); - massDifferences.Add( - Math.Abs( - decoy.PeakfindingMass - foundPeak.Apex.IndexedPeak.Mz.ToMass(foundPeak.Apex.ChargeState) - )); - } - - decoysConsidered++; - if (decoyPeaks.Count >= 750) break; - } - - int placeholder = 0; - - double massDiffMean = massDifferences.Select(m => Math.Abs(m)).Average(); - double envelopeCountMean = decoyPeaks.Select(peak => peak.IsotopicEnvelopes.Count).Average(); - double intensityMean = decoyPeaks.Select(peak => peak.IsotopicEnvelopes.Select(e => e.Intensity).Average()).Average(); - - placeholder = 1; - - // Repeat, but for targets - string targetPeptidePath = @"C:\Users\Alex\Source\Repos\chronologer\chronologer-main(noChange)\Unhealthy_CH2_J5_MBR_Predicted.tsv"; - //For MBR Predicted file - int fullSeqCol = 3; - int massCol = 5; - int rtColumn = 24; - - List targetIDs = new(); - using (StreamReader reader = new StreamReader(targetPeptidePath)) - { - reader.ReadLine(); - while (!reader.EndOfStream) - { - string[] lineSplit = reader.ReadLine().Split('\t'); - if (lineSplit[fullSeqCol].Contains('[')) continue; - if (double.Parse(lineSplit[rtColumn]) > 60) continue; - - - Peptide peptide = new Peptide(sequence: lineSplit[fullSeqCol]); - Identification targetId = new Identification(j5, peptide.BaseSequence, peptide.BaseSequence, - peptide.MonoisotopicMass, ms2RetentionTimeInMinutes: double.Parse(lineSplit[rtColumn]), - chargeState: peptide.MonoisotopicMass > 1600 ? 3 : 2, pgs); - targetIDs.Add(targetId); - } - } - - - engine = new FlashLfqEngine( - targetIDs - ); - - engine.CalculateTheoreticalIsotopeDistributions(); - engine._ms1Scans = new Dictionary(); - engine._peakIndexingEngine.IndexMassSpectralPeaks(j5, true, engine._ms1Scans); - - - List targetPeaks = new(); - List massDifferencesTarget = new(); - Dictionary apexPeakDictTarget = new(); - int targetsConsidered = 0; - foreach (Identification target in targetIDs) - { - PpmTolerance tolerance = new PpmTolerance(10); - var foundPeak = engine.FindDecoyPeak( - j5, - apexPeakDictTarget, - tolerance, - (target.Ms2RetentionTimeInMinutes, rtRange, null, null), - target); - - if (foundPeak != null) - { - targetPeaks.Add(foundPeak); - massDifferencesTarget.Add( - Math.Abs( - target.PeakfindingMass - foundPeak.Apex.IndexedPeak.Mz.ToMass(foundPeak.Apex.ChargeState) - )); - } - - targetsConsidered++; - if (targetPeaks.Count >= 750) break; - } - - double massDiffMeanT = massDifferencesTarget.Select(m => Math.Abs(m)).Average(); - double envelopeCountMeanT = targetPeaks.Select(peak => peak.IsotopicEnvelopes.Count).Average(); - double intensityMeanT = targetPeaks.Select(peak => peak.IsotopicEnvelopes.Select(e => e.Intensity).Average()).Average(); - - placeholder = 2; - - using(StreamWriter writer = new StreamWriter(@"C:\Users\Alex\Desktop\MBR_10_30\Take8_MbrTargetRT_4MinWindow.tsv")) - { - string[] header = new string[] - { - "Sequence", - "Target/Decoy", - "Theoretical Peak Finding Mass", - "Found Mass", - "Number of Scans", - "Apex Intensity", - "Retention Time", - "Predicted Retention Time" - }; - writer.WriteLine(string.Join('\t', header)); - - foreach (var decoy in decoyPeaks) - { - double peakFindingMass = decoy.Identifications.First().PeakfindingMass; - header = new string[] - { - decoy.Identifications.First().BaseSequence, - "D", - decoy.Identifications.First().PeakfindingMass.ToString(), - decoy.Apex.IndexedPeak.Mz.ToMass(decoy.Apex.ChargeState).ToString(), - decoy.IsotopicEnvelopes.Count.ToString(), - decoy.Apex.Intensity.ToString(), - decoy.Apex.IndexedPeak.RetentionTime.ToString(), - decoy.Identifications.First().Ms2RetentionTimeInMinutes.ToString() - }; - writer.WriteLine(string.Join('\t', header)); - } - - foreach (var target in targetPeaks) - { - double peakFindingMass = target.Identifications.First().PeakfindingMass; - header = new string[] - { - target.Identifications.First().BaseSequence, - "T", - target.Identifications.First().PeakfindingMass.ToString(), - target.Apex.IndexedPeak.Mz.ToMass(target.Apex.ChargeState).ToString(), - target.IsotopicEnvelopes.Count.ToString(), - target.Apex.Intensity.ToString(), - target.Apex.IndexedPeak.RetentionTime.ToString(), - target.Identifications.First().Ms2RetentionTimeInMinutes.ToString() - }; - writer.WriteLine(string.Join('\t', header)); - } - } - - } - } } From 4f3fc9df291d62c2c88efd1c1a2fc07e209b4c9b Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 2 Feb 2024 16:29:38 -0600 Subject: [PATCH 17/55] Edited scoring method --- mzLib/FlashLFQ/ChromatographicPeak.cs | 14 ++++++ mzLib/FlashLFQ/FlashLfqEngine.cs | 6 +-- mzLib/FlashLFQ/MbrScorer.cs | 62 +++++++++++++++++++----- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 2 +- mzLib/TestFlashLFQ/TestFlashLFQ.cs | 2 +- 5 files changed, 69 insertions(+), 17 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 503e4f59d..8f0f26183 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -11,11 +11,17 @@ namespace FlashLFQ public class ChromatographicPeak { public double Intensity; + public double ApexRetentionTime => Apex.IndexedPeak.RetentionTime; public readonly SpectraFileInfo SpectraFileInfo; public List IsotopicEnvelopes; + public int ScanCount => IsotopicEnvelopes.Count; public double SplitRT; public readonly bool IsMbrPeak; public double MbrScore; + public double PpmScore { get; set; } + public double IntensityScore { get; set; } + public double RtScore { get; set; } + public double ScanCountScore { get; set; } public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) { @@ -75,6 +81,10 @@ public static string TabSeparatedHeader sb.Append("Num Charge States Observed" + "\t"); sb.Append("Peak Detection Type" + "\t"); sb.Append("MBR Score" + "\t"); + sb.Append("Ppm Score" + "\t"); + sb.Append("Intensity Score" + "\t"); + sb.Append("Rt Score" + "\t"); + sb.Append("Scan Count Score" + "\t"); sb.Append("PSMs Mapped" + "\t"); sb.Append("Base Sequences Mapped" + "\t"); sb.Append("Full Sequences Mapped" + "\t"); @@ -250,6 +260,10 @@ public override string ToString() } sb.Append("" + (IsMbrPeak ? MbrScore.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? PpmScore.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? IntensityScore.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? RtScore.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? ScanCountScore.ToString() : "") + "\t"); sb.Append("" + Identifications.Count + "\t"); sb.Append("" + NumIdentificationsByBaseSeq + "\t"); diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 59ba55b71..3c4c613fe 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -995,10 +995,8 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.MbrScore = scorer.ScoreMbr(rtScoringDistribution, - retentionTime: acceptorPeak.Apex.IndexedPeak.RetentionTime, - ppmError: acceptorPeak.MassError, - acceptorIntensity: acceptorPeak.Intensity, + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, + rtScoringDistribution, donorPeak); return acceptorPeak; diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index e634e4e85..9ce66ddbe 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -2,6 +2,7 @@ using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; +using System.Data; using System.Linq; namespace FlashLFQ @@ -15,11 +16,13 @@ internal class MbrScorer // Intensity and ppm distribution are specific to each acceptor file private readonly Normal _logIntensityDistribution; private readonly Normal _ppmDistribution; + private readonly Normal _scanCountDistribution; // The logFcDistributions are unique to each donor file - acceptor file pair private Dictionary _logFcDistributionDictionary; internal Dictionary ApexToAcceptorFilePeakDict { get; } internal List UnambiguousMsMsAcceptorPeaks { get; } + internal double MaxNumberOfScansObserved { get; } /// @@ -34,28 +37,57 @@ internal MbrScorer( { ApexToAcceptorFilePeakDict = apexToAcceptorFilePeakDict; UnambiguousMsMsAcceptorPeaks = acceptorPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); + MaxNumberOfScansObserved = acceptorPeaks.Max(peak => peak.ScanCount); _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; _logFcDistributionDictionary = new(); + // This is kludgey, because scan counts are discrete + List scanList = acceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); + // build a normal distribution for the scan list of the acceptor peaks + _scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36); } /// /// Scores a MBR peak based on it's retention time, ppm error, and intensity /// /// The MBR score as a double. Higher scores are better. - internal double ScoreMbr(Normal rtDistribution, double retentionTime, double ppmError, double acceptorIntensity, ChromatographicPeak? donorPeak = null) + internal double ScoreMbr(ChromatographicPeak acceptorPeak, Normal rtDistribution, ChromatographicPeak? donorPeak = null) + { + acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); + acceptorPeak.RtScore = CalculateScore(rtDistribution, acceptorPeak.ApexRetentionTime); + acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); + acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); + //acceptorPeak.ScanCountScore = (double)acceptorPeak.ScanCount / _scanCountDistribution.Median; + + double donorIdPEP = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First().PosteriorErrorProbability; + + return (acceptorPeak.IntensityScore + acceptorPeak.RtScore + acceptorPeak.PpmScore + acceptorPeak.ScanCountScore) * (1 - donorIdPEP); + } + + internal double CalculateScore(Normal distribution, double value) + { + // new method + double absoluteDiffFromMean = Math.Abs(distribution.Mean - value); + // Returns a value between (0, 1] where 1 means the value was equal to the distribution mean + return 2 * distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); + + // old method + //return DensityScoreConversion(distribution.Density(value)); + } + + internal double CalculateIntensityScore(double acceptorIntensity, ChromatographicPeak donorPeak) { - double intensityDensity; if (donorPeak != null && acceptorIntensity != 0 && donorPeak.Intensity != 0 && _logFcDistributionDictionary.TryGetValue(donorPeak.SpectraFileInfo, out var logFcDistribution)) { - intensityDensity = logFcDistribution.Density( - Math.Log(acceptorIntensity, 2) - Math.Log(donorPeak.Intensity, 2) - ); + var logFoldChange = Math.Log(acceptorIntensity, 2) - Math.Log(donorPeak.Intensity, 2); + return CalculateScore(logFcDistribution, logFoldChange); } else { var logIntensity = Math.Log(acceptorIntensity, 2); + return CalculateScore(_logIntensityDistribution, logIntensity); + // I don't know what the if/else statement accomplishes. It feels like we should take the density regardless // As it is, the score is artifically inflated for very intense peaks @@ -65,16 +97,23 @@ internal double ScoreMbr(Normal rtDistribution, double retentionTime, double ppm // intensityDensity = _logIntensityDistribution.Density(_logIntensityDistribution.Mode); //alternate, more straightforward approach - intensityDensity = _logIntensityDistribution.Density(logIntensity); } - double intensityScore = DensityScoreConversion(intensityDensity); - double ppmScore = DensityScoreConversion(_ppmDistribution.Density(ppmError)); - double rtScore = DensityScoreConversion(rtDistribution.Density(retentionTime)); + } - double donorIdPEP = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First().PosteriorErrorProbability; + internal double CalculatePpmScore(double ppmError) + { + return DensityScoreConversion(_ppmDistribution.Density(ppmError)); + } - return (ppmScore + rtScore + intensityScore) * (1 - donorIdPEP); + internal double CalculateRtScore(double retentionTime, Normal rtDistribution) + { + return DensityScoreConversion(rtDistribution.Density(retentionTime)); + } + + internal double CalculateScanCountScore(int scanCount) + { + return (double)scanCount / (double)MaxNumberOfScansObserved; } /// @@ -130,6 +169,7 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP _logFcDistributionDictionary.Add(idDonorPeaks.First().SpectraFileInfo, foldChangeDistribution); } } + /// /// Takes in the density of a normal distribution at a given point, and transforms it /// by taking the log of the density plus the square root of the squared density plus one diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index fe902088e..a0a0b2bab 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -110,7 +110,7 @@ public static void TwoFileMbrTest() mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllMbrPeaks.tsv")) + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllMbrPeaks_NewScoringMethod_plusScans.tsv")) { writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); foreach (var peak in mbrPeaks) diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 96fcec2a6..c5f5497de 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -1344,7 +1344,7 @@ public static void RealDataMbrTest() var f1r2MbrResults = results.PeptideModifiedSequences .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MSMS && p.Value.GetDetectionType(f1r2) == DetectionType.MBR).ToList(); - Assert.That(f1r2MbrResults.Count >= 77); + Assert.GreaterOrEqual(f1r2MbrResults.Count, 77); List<(double, double)> peptideIntensities = new List<(double, double)>(); From 79b2890c5de3546d7470ccf51332565601e4ef25 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 6 Feb 2024 17:59:56 -0600 Subject: [PATCH 18/55] Umm, edited scoring, increased ppm tolerance, changed minimum rt width for variance and random rt transfers --- mzLib/FlashLFQ/ChromatographicPeak.cs | 5 +- mzLib/FlashLFQ/FlashLFQResults.cs | 21 ++++- mzLib/FlashLFQ/FlashLfqEngine.cs | 114 ++++++++++++++--------- mzLib/FlashLFQ/MbrScorer.cs | 7 +- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 20 +++- 5 files changed, 115 insertions(+), 52 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 8f0f26183..c0490ba65 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -22,6 +22,7 @@ public class ChromatographicPeak public double IntensityScore { get; set; } public double RtScore { get; set; } public double ScanCountScore { get; set; } + public string Collision { get; set; } public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) { @@ -92,6 +93,7 @@ public static string TabSeparatedHeader sb.Append("Peak Apex Mass Error (ppm)"); sb.Append("\t" + "Decoy Peptide"); sb.Append("\t" + "Random Rt"); + sb.Append("\t" + "Collision"); //sb.Append("Timepoints"); return sb.ToString(); } @@ -272,7 +274,8 @@ public override string ToString() sb.Append("" + MassError); sb.Append("\t" + DecoyPeptide); sb.Append("\t" + RandomRt); - + sb.Append("\t" + Collision ?? ""); + return sb.ToString(); } } diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index ac9c36189..c6218bc81 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -1,4 +1,5 @@ -using MathNet.Numerics.Statistics; +using Easy.Common.Extensions; +using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; using System.IO; @@ -13,6 +14,7 @@ public class FlashLfqResults public readonly Dictionary PeptideModifiedSequences; public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; + public IEnumerable DecoyPeaks { get; set; } public FlashLfqResults(List spectraFiles, List identifications) { @@ -539,7 +541,7 @@ public void CalculateProteinResultsMedianPolish(bool useSharedPeptides) } } - public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent) + public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent, string decoyPath = null) { if (!silent) { @@ -561,6 +563,21 @@ public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, st } } + if(decoyPath != null & DecoyPeaks.IsNotNullOrEmpty()) + { + using (StreamWriter output = new StreamWriter(decoyPath)) + { + output.WriteLine(ChromatographicPeak.TabSeparatedHeader); + + foreach (var peak in DecoyPeaks + .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) + .ThenByDescending(p => p.MbrScore)) + { + output.WriteLine(peak.ToString()); + } + } + } + if (modPeptideOutputPath != null) { using (StreamWriter output = new StreamWriter(modPeptideOutputPath)) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 3c4c613fe..9d0aed9b7 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -67,6 +67,7 @@ public class FlashLfqEngine private FlashLfqResults _results; internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; + internal ConcurrentBag DecoyPeaks { get; private set; } public FlashLfqEngine( List allIdentifications, @@ -112,6 +113,10 @@ public FlashLfqEngine( PpmTolerance = ppmTolerance; IsotopePpmTolerance = isotopeTolerancePpm; MatchBetweenRuns = matchBetweenRuns; + if(MatchBetweenRuns) + { + DecoyPeaks = new(); + } MbrPpmTolerance = matchBetweenRunsPpmTolerance; Integrate = integrate; NumIsotopesRequired = numIsotopesRequired; @@ -206,6 +211,7 @@ public FlashLfqResults Run() Console.WriteLine("Finished MBR for " + spectraFile.FilenameWithoutExtension); } } + _results.DecoyPeaks = DecoyPeaks; } // normalize @@ -573,7 +579,8 @@ internal RtInfo PredictRetentionTime( rtRange = (double)rtInterquartileRange * 4.5; // Multiplication inherited from legacy code, unsure of reason for 4.5 } - rtRange = Math.Min(rtRange, MbrRtWindow); + //TODO: Expand range and see what happens + rtRange = Math.Min(rtRange+1, MbrRtWindow+1); return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); } @@ -646,6 +653,8 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (scorer == null) return; + mbrTol = new PpmTolerance(50); + // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier _peakIndexingEngine.DeserializeIndex(idAcceptorFile); @@ -665,7 +674,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // this stores the results of MBR ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); Random randomGenerator = new Random(); - ConcurrentDictionary acceptorPeakDecoyPeakDict = new(); + //ConcurrentDictionary acceptorPeakDecoyPeakDict = new(); // map each donor file onto this file foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) @@ -723,8 +732,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // Draw a random donor that has an rt sufficiently far enough away var randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; int randomPeaksSampled = 0; + double minimumDifference = Math.Min(rtInfo.Width * 1.25, 1); while(randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence - || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < rtInfo.Width*1.25) // multiply for safety, in case the relative rt shifts after alignment + || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumDifference) // multiply for safety, in case the relative rt shifts after alignment { randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; if (randomPeaksSampled++ > (idDonorPeaks.Count - 1)) @@ -740,52 +750,60 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (decoyRtInfo == null) continue; // Find a decoy peak using the randomly drawn retention time FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, randomRt:decoyRtInfo.PredictedRt); - acceptorPeakDecoyPeakDict.TryAdd(donorPeak.Identifications.First(), (bestAcceptor, bestDecoy)); + if(bestDecoy != null) + { + DecoyPeaks.Add(bestDecoy); + } + //acceptorPeakDecoyPeakDict.TryAdd(donorPeak.Identifications.First(), (bestAcceptor, bestDecoy)); } }); } + //var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); + //if(test.Count > 100) + //{ + // using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrTargetDecoy_WideTolerance_wideRt.tsv")) + // { + // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + // foreach (var pair in test) + // { + // if (pair.Value.target != null) + // writer.WriteLine(pair.Value.target.ToString()); + // else + // writer.WriteLine(""); + // writer.WriteLine(pair.Value.decoy.ToString()); + // } + // } + //} // If we have multiple identification with the same sequence mapped to the same peak, we want to sum their MBR scores // This is done here - foreach(var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) + foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) { // Each isotopic envelope is linked to a list of ChromatographicPeaks // If multiple chromatographic peaks are linked, each with the same peptide identification, then their mbr scores are summed // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. - foreach (var envelopeListKvp in seqDictionaryKvp.Value) + foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) { List collapsedPeaks = new(); - foreach(var peakGroup in envelopeListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + foreach(var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) { - var scoreSum = peakGroup.Sum(peak => peak.MbrScore); + //var scoreSum = peakGroup.Sum(peak => peak.MbrScore); var idList = peakGroup.Select(peak => peak.Identifications.First()).Distinct().ToList(); // This is fine, because each mbrPeak only has one identification - var collapsedPeak = peakGroup.First(); - collapsedPeak.MbrScore = scoreSum; + var collapsedPeak = peakGroup.OrderBy(peak => peak.MbrScore).First(); + //collapsedPeak.MbrScore = peakGroup.Max(peak => peak.MbrScore); // Lmao, these three lines are problematic. Should probably do something about them collapsedPeak.Identifications.Clear(); collapsedPeak.Identifications.AddRange(idList); collapsedPeak.ResolveIdentifications(); collapsedPeaks.Add(collapsedPeak); } - envelopeListKvp.Value.Clear(); - envelopeListKvp.Value.AddRange(collapsedPeaks); + envelopePeakListKvp.Value.Clear(); + envelopePeakListKvp.Value.AddRange(collapsedPeaks); } } - //var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); - //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrTargetDecoyFirst.tsv")) - //{ - // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - // foreach (var pair in test) - // { - // if (pair.Value.target != null) - // writer.WriteLine(pair.Value.target.ToString()); - // else - // writer.WriteLine(""); - // writer.WriteLine(pair.Value.decoy.ToString()); - // } - //} + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) @@ -799,11 +817,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); ChromatographicPeak best = peakHypotheses.First(); - if(best.DecoyPeptide) - { - int placeholder = 0; - } - peakHypotheses.Remove(best); if (peakHypotheses.Count > 0) @@ -814,10 +827,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) List peaksToRemoveFromHypotheses = new List(); foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) { - if (peak.DecoyPeptide) - { - int placeholder = 0; - } if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) { best.MergeFeatureWith(peak, Integrate); @@ -892,7 +901,8 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - Normal rtScoringDistribution = randomRt == null ? new Normal(rtInfo.PredictedRt, rtInfo.Width / 6) : new Normal((double)randomRt, rtInfo.Width / 6); + double rtVariance = Math.Min((rtInfo.Width - 1) / 6, 0.05); // Minimum standard deviation of 3 seconds + Normal rtScoringDistribution = randomRt == null ? new Normal(rtInfo.PredictedRt, rtVariance) : new Normal((double)randomRt, rtVariance); bestAcceptor = null; foreach (int z in chargesToMatch) @@ -1097,20 +1107,19 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } IndexedMassSpectralPeak apexImsPeak = tryPeak.Apex.IndexedPeak; - if (errorCheckedPeaksGroupedByApex.TryGetValue(apexImsPeak, out ChromatographicPeak storedPeak)) + if (errorCheckedPeaksGroupedByApex.TryGetValue(apexImsPeak, out ChromatographicPeak storedPeak) && storedPeak != null) { - if (tryPeak.IsMbrPeak && storedPeak == null) - { - continue; - } - + //if (tryPeak.IsMbrPeak && storedPeak == null) + //{ + // continue; + //} if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { storedPeak.MergeFeatureWith(tryPeak, Integrate); } else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - continue; + continue; // Default to MSMS peaks over MBR Peaks } else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak) { @@ -1127,6 +1136,27 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) else { errorCheckedPeaksGroupedByApex.Add(apexImsPeak, tryPeak); + + } + } + + foreach(var peak in DecoyPeaks.Where(peak => peak.SpectraFileInfo == spectraFile)) + { + var apexIms = peak.Apex.IndexedPeak; + if(errorCheckedPeaksGroupedByApex.TryGetValue(apexIms, out var collisionPeak)) + { + if(collisionPeak.IsMbrPeak) + { + peak.Collision = "MBR"; + } + else + { + peak.Collision = "MSMS"; + } + } + else + { + peak.Collision = "N/A"; } } diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 9ce66ddbe..de4ed4be1 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -50,7 +50,7 @@ internal MbrScorer( /// /// Scores a MBR peak based on it's retention time, ppm error, and intensity /// - /// The MBR score as a double. Higher scores are better. + /// An MBR Score ranging between 0 and 100. Higher scores are better. internal double ScoreMbr(ChromatographicPeak acceptorPeak, Normal rtDistribution, ChromatographicPeak? donorPeak = null) { acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); @@ -60,8 +60,9 @@ internal double ScoreMbr(ChromatographicPeak acceptorPeak, Normal rtDistribution //acceptorPeak.ScanCountScore = (double)acceptorPeak.ScanCount / _scanCountDistribution.Median; double donorIdPEP = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First().PosteriorErrorProbability; - - return (acceptorPeak.IntensityScore + acceptorPeak.RtScore + acceptorPeak.PpmScore + acceptorPeak.ScanCountScore) * (1 - donorIdPEP); + + // Returns 100 times the geometric mean of the four scores + return 100 * Math.Pow( acceptorPeak.IntensityScore * acceptorPeak.RtScore * acceptorPeak.PpmScore * acceptorPeak.ScanCountScore, 0.25); } internal double CalculateScore(Normal distribution, double value) diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index a0a0b2bab..25cfc2f58 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -44,7 +44,8 @@ public static int TestDecoySearchFlipFlop(int searchCount) // This is gonna have a bunch of local file references, just a heads up. Dont make github try and build this one public static void TwoFileMbrTest() { - string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\subset_psms.psmtsv"; + //string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\subset_psms.psmtsv"; + string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllPSMs_1PercentFdr.psmtsv"; SpectraFileInfo j5 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J5.raw", "a", 0, 0, 0); SpectraFileInfo j6 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J6.raw", "a", 1, 0, 0); @@ -97,6 +98,7 @@ public static void TwoFileMbrTest() ids.Add(id); } + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); var results = engine.Run(); @@ -105,12 +107,13 @@ public static void TwoFileMbrTest() List mbrPeaks = new(); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.RandomRt).ToList()); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.RandomRt && !peak.DecoyPeptide).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && !peak.RandomRt).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && peak.RandomRt).ToList()); mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllMbrPeaks_NewScoringMethod_plusScans.tsv")) + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_minRtDiff.tsv")) { writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); foreach (var peak in mbrPeaks) @@ -119,6 +122,15 @@ public static void TwoFileMbrTest() } } + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) + { + writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + foreach (var peak in engine.DecoyPeaks) + { + writer.WriteLine(peak); + } + } + var f1r1MbrResults = results .PeptideModifiedSequences .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); From 5d5c1eeae68a656f901cd066e3c57c61a7527b4b Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 7 Feb 2024 17:17:41 -0600 Subject: [PATCH 19/55] Did some weird filtering of donor files that should probably be reverted --- mzLib/FlashLFQ/ChromatographicPeak.cs | 2 +- mzLib/FlashLFQ/FlashLfqEngine.cs | 77 +++++++++- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 188 ++++++++++++++++++++++- mzLib/mzLib.nuspec | 2 +- 4 files changed, 265 insertions(+), 4 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index c0490ba65..e915b222b 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -11,7 +11,7 @@ namespace FlashLFQ public class ChromatographicPeak { public double Intensity; - public double ApexRetentionTime => Apex.IndexedPeak.RetentionTime; + public double ApexRetentionTime => Apex?.IndexedPeak.RetentionTime ?? -1; public readonly SpectraFileInfo SpectraFileInfo; public List IsotopicEnvelopes; public int ScanCount => IsotopicEnvelopes.Count; diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 9d0aed9b7..7a6fce5b0 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -39,6 +39,7 @@ public class FlashLfqEngine public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; + public readonly double MbrAlignmentWindow = 0.5; public readonly bool RequireMsmsIdInCondition; // settings for the Bayesian protein quantification engine @@ -67,6 +68,7 @@ public class FlashLfqEngine private FlashLfqResults _results; internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; + internal Dictionary> DonorFileToModSeqDict { get; private set; } internal ConcurrentBag DecoyPeaks { get; private set; } public FlashLfqEngine( @@ -195,6 +197,7 @@ public FlashLfqResults Run() // do MBR if (MatchBetweenRuns) { + FindPeptideDonorFiles(); foreach (var spectraFile in _spectraFileInfo) { if (!Silent) @@ -626,6 +629,54 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, logIntensityDistribution); } + /// + /// For every MSMS identified peptide, selects one file that will be used as the donor + /// by finding files that contain the most peaks in the local neighborhood, + /// then writes the restults to the DonorFileToIdsDict + /// + private void FindPeptideDonorFiles() + { + DonorFileToModSeqDict = new Dictionary>(); + + // iterate through each unique sequence + foreach (string modSeq in _modifiedSequenceToIsotopicDistribution.Keys) + { + SpectraFileInfo bestDonor = null; + int bestDonorNeighborPeakCount = 0; + foreach(SpectraFileInfo file in _spectraFileInfo) + { + var peaksForPeptide = _results.Peaks[file].Where(peak => + peak.Identifications.Any(id => id.ModifiedSequence.Equals(modSeq))); + int neighboringPeakCountMax = 0; + foreach(var donorPeak in peaksForPeptide) + { + // Count the number of neighboring peaks with unique peptides + int neighboringPeaksCount = _results.Peaks[file] + .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) + .Select(peak => peak.Identifications.First().ModifiedSequence) + .Distinct() + .Count(); + if (neighboringPeaksCount > neighboringPeakCountMax) + neighboringPeakCountMax = neighboringPeaksCount; + } + + if(neighboringPeakCountMax > bestDonorNeighborPeakCount) + { + bestDonor = file; + bestDonorNeighborPeakCount = neighboringPeakCountMax; + } + } + if(DonorFileToModSeqDict.ContainsKey(bestDonor)) + { + DonorFileToModSeqDict[bestDonor].Add(modSeq); + } + else + { + DonorFileToModSeqDict.Add(bestDonor, new List { modSeq }); + } + } + } + /// /// This method maps identified peaks from other chromatographic runs ("donors") onto /// the defined chromatographic run ("acceptor"). The goal is to reduce the number of missing @@ -635,6 +686,18 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie /// private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { + // This was just to test the DonorFileToModSeqDict is constructed correctly + + //List < HashSet > seqSets = new(); + //foreach(var listofSeqs in DonorFileToModSeqDict.Values) + //{ + // seqSets.Add(new HashSet(listofSeqs)); + //} + //var interesect = seqSets[0].Intersect(seqSets[1]).Intersect(seqSets[2]); + //var onetTwo = seqSets[0].Intersect(seqSets[1]); + //var twoThre = seqSets[2].Intersect(seqSets[1]); + //var oneThree = seqSets[0].Intersect(seqSets[2]); + bool acceptorSampleIsFractionated = _results.SpectraFiles .Where(p => p.Condition == idAcceptorFile.Condition && p.BiologicalReplicate == idAcceptorFile.BiologicalReplicate) .Select(p => p.Fraction) @@ -653,7 +716,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (scorer == null) return; - mbrTol = new PpmTolerance(50); + //mbrTol = new PpmTolerance(50); // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier _peakIndexingEngine.DeserializeIndex(idAcceptorFile); @@ -676,6 +739,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) Random randomGenerator = new Random(); //ConcurrentDictionary acceptorPeakDecoyPeakDict = new(); + + // New Approach - one donor file per peptide + // map each donor file onto this file foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) { @@ -683,12 +749,21 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { continue; } + if (!DonorFileToModSeqDict.TryGetValue(idDonorFile, out var sequencesInDonor)) + { + continue; + } + else if (!sequencesInDonor.Any()) + { + continue; + } // this is the list of peaks identified in the other file but not in this one ("ID donor peaks") List idDonorPeaks = _results.Peaks[idDonorFile].Where(p => !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1 && p.IsotopicEnvelopes.Any() + && sequencesInDonor.Contains(p.Identifications.First().ModifiedSequence) && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index 25cfc2f58..0250aaad3 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -16,7 +16,7 @@ using ChromatographicPeak = FlashLFQ.ChromatographicPeak; using Stopwatch = System.Diagnostics.Stopwatch; using Peptide = Proteomics.AminoAcidPolymer.Peptide; - +using System.Windows.Shapes; namespace Test { @@ -180,5 +180,191 @@ public static void TwoFileMbrTest() } } + + [Test] + // This is gonna have a bunch of local file references, just a heads up. Dont make github try and build this one + public static void ThreeFileMbrTest() + { + //string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\subset_psms.psmtsv"; + string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllPSMs_1PercentFdr.psmtsv"; + string psmFile2 = @"D:\SingleCellDataSets\Organoid\Search_MM_320\Task1-SearchTask\Individual File Results\HFL1SC_Unhealthy_CH2_J7-calib_PSMs.psmtsv"; + + SpectraFileInfo j5 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J5.raw", "a", 0, 0, 0); + SpectraFileInfo j6 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J6.raw", "a", 1, 0, 0); + SpectraFileInfo j7 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\Calibration_MM_320\Task1-CalibrateTask\HFL1SC_Unhealthy_CH2_J7-calib.mzML", "a", 2, 0, 0); + + + List ids = new List(); + Dictionary allProteinGroups = new Dictionary(); + foreach (string line in File.ReadAllLines(psmFile)) + { + var split = line.Split(new char[] { '\t' }); + + if (split.Contains("File Name") || string.IsNullOrWhiteSpace(line)) + { + continue; + } + + SpectraFileInfo file = null; + + if (split[0].Contains("J5")) + { + file = j5; + } + else if (split[0].Contains("J6")) + { + file = j6; + } + + string baseSequence = split[12]; + string fullSequence = split[13]; + double monoMass = double.Parse(split[22]); + double rt = double.Parse(split[2]); + int z = (int)double.Parse(split[6]); + var proteins = split[25].Split(new char[] { '|' }); + List proteinGroups = new List(); + foreach (var protein in proteins) + { + if (allProteinGroups.TryGetValue(protein, out var proteinGroup)) + { + proteinGroups.Add(proteinGroup); + } + else + { + allProteinGroups.Add(protein, new ProteinGroup(protein, "", "")); + proteinGroups.Add(allProteinGroups[protein]); + } + } + + bool isDecoy = split[32] == "Y"; + + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy); + ids.Add(id); + } + foreach(string line in File.ReadAllLines(psmFile2)) + { + var split = line.Split(new char[] { '\t' }); + + if (split.Contains("File Name") || string.IsNullOrWhiteSpace(line)) + { + continue; + } + + SpectraFileInfo file = j7; + + double qval = Double.Parse(split[50]); + if (qval > 0.01) continue; + + string baseSequence = split[12]; + string fullSequence = split[13]; + if(!double.TryParse(split[22], out var x)) + { + continue; // Occurs for ambiguous peptides + } + double monoMass = double.Parse(split[22]); + double rt = double.Parse(split[2]); + int z = (int)double.Parse(split[6]); + var proteins = split[25].Split(new char[] { '|' }); + List proteinGroups = new List(); + foreach (var protein in proteins) + { + if (allProteinGroups.TryGetValue(protein, out var proteinGroup)) + { + proteinGroups.Add(proteinGroup); + } + else + { + allProteinGroups.Add(protein, new ProteinGroup(protein, "", "")); + proteinGroups.Add(allProteinGroups[protein]); + } + } + + bool isDecoy = split[32] == "Y"; + + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy); + ids.Add(id); + } + + + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); + var results = engine.Run(); + + var test = results.Peaks.Values.SelectMany(peakList => peakList).ToList(); + int place = 0; + + List mbrPeaks = new(); + + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.RandomRt && !peak.DecoyPeptide).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && !peak.RandomRt).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && peak.RandomRt).ToList()); + mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); + + + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_minRtDiff.tsv")) + { + writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + foreach (var peak in mbrPeaks) + { + writer.WriteLine(peak); + } + } + + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) + { + writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + foreach (var peak in engine.DecoyPeaks) + { + writer.WriteLine(peak); + } + } + + var f1r1MbrResults = results + .PeptideModifiedSequences + .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); + + Assert.That(f1r1MbrResults.Count >= 132); + + var f1r2MbrResults = results.PeptideModifiedSequences + .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MSMS && p.Value.GetDetectionType(j6) == DetectionType.MBR).ToList(); + + Assert.That(f1r2MbrResults.Count >= 77); + + List<(double, double)> peptideIntensities = new List<(double, double)>(); + + foreach (var peptide in f1r1MbrResults) + { + double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j5)); + double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j6)); + peptideIntensities.Add((mbrIntensity, msmsIntensity)); + } + + double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); + Assert.Greater(corr, 0.8); + + peptideIntensities.Clear(); + foreach (var peptide in f1r2MbrResults) + { + double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j6)); + double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j5)); + peptideIntensities.Add((mbrIntensity, msmsIntensity)); + } + + corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); + + Assert.That(corr > 0.7); + + // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein + // has to be observed in a condition for match-between-runs + j5.Condition = "b"; + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); + results = engine.Run(); + var proteinsObservedInF1 = ids.Where(p => p.FileInfo == j5).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF2 = ids.Where(p => p.FileInfo == j6).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF1ButNotF2 = proteinsObservedInF1.Except(proteinsObservedInF2).ToList(); + foreach (ProteinGroup protein in proteinsObservedInF1ButNotF2) + { + Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(j6) == 0); + } + } } } diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 387d4aa71..afe4bf50c 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 5.0.203 + 5.1.3 mzLib Stef S. Stef S. From 4a9149cf108adaf2690058eae3b83452389a1a31 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 12 Feb 2024 17:36:56 -0600 Subject: [PATCH 20/55] Peak Organism fix --- mzLib/FlashLFQ/ChromatographicPeak.cs | 3 +++ mzLib/FlashLFQ/FlashLfqEngine.cs | 2 +- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 20 ++++++++++---------- mzLib/mzLib.nuspec | 2 +- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index c0490ba65..067308e8b 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -69,6 +69,7 @@ public static string TabSeparatedHeader sb.Append("Base Sequence" + "\t"); sb.Append("Full Sequence" + "\t"); sb.Append("Protein Group" + "\t"); + sb.Append("Organism" + '\t'); sb.Append("Peptide Monoisotopic Mass" + "\t"); sb.Append("MS2 Retention Time" + "\t"); sb.Append("Precursor Charge" + "\t"); @@ -211,10 +212,12 @@ public override string ToString() if (t.Any()) { sb.Append(string.Join(";", t) + '\t'); + sb.Append(string.Join(";", Identifications.SelectMany(id => id.ProteinGroups).Select(p => p.Organism).Distinct()) + '\t'); } else { sb.Append("" + '\t'); + sb.Append("" + '\t'); } sb.Append("" + Identifications.First().MonoisotopicMass + '\t'); diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 9d0aed9b7..01a992cb4 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -653,7 +653,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (scorer == null) return; - mbrTol = new PpmTolerance(50); + //mbrTol = new PpmTolerance(50); // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier _peakIndexingEngine.DeserializeIndex(idAcceptorFile); diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index 25cfc2f58..05ae76259 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -87,7 +87,7 @@ public static void TwoFileMbrTest() } else { - allProteinGroups.Add(protein, new ProteinGroup(protein, "", "")); + allProteinGroups.Add(protein, new ProteinGroup(protein, "", "Homo Sapiens")); proteinGroups.Add(allProteinGroups[protein]); } } @@ -113,7 +113,7 @@ public static void TwoFileMbrTest() mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_minRtDiff.tsv")) + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_orgTest.tsv")) { writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); foreach (var peak in mbrPeaks) @@ -122,14 +122,14 @@ public static void TwoFileMbrTest() } } - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) - { - writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - foreach (var peak in engine.DecoyPeaks) - { - writer.WriteLine(peak); - } - } + //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) + //{ + // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + // foreach (var peak in engine.DecoyPeaks) + // { + // writer.WriteLine(peak); + // } + //} var f1r1MbrResults = results .PeptideModifiedSequences diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 387d4aa71..6c7bb6692 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 5.0.203 + 5.2.0 mzLib Stef S. Stef S. From 61fac1cba11482b121aea8e9fdeb5190af80efd3 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 15 Feb 2024 01:01:23 -0600 Subject: [PATCH 21/55] Basically works, QValue still underestimates --- mzLib/FlashLFQ/ChromatographicPeak.cs | 1 + mzLib/FlashLFQ/FlashLfqEngine.cs | 217 +++++++++++------------ mzLib/FlashLFQ/Identification.cs | 6 +- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 30 ++-- mzLib/mzLib.nuspec | 2 +- 5 files changed, 126 insertions(+), 130 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 01ba90d0e..1ab123c75 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -22,6 +22,7 @@ public class ChromatographicPeak public double IntensityScore { get; set; } public double RtScore { get; set; } public double ScanCountScore { get; set; } + public List ChargeList { get; set; } public string Collision { get; set; } public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 7a6fce5b0..ad798f4f2 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -36,10 +36,22 @@ public class FlashLfqEngine // MBR settings public readonly bool MatchBetweenRuns; - public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; + + // New MBR Settings + public readonly double RtWindowIncrease = 1.0; public readonly double MbrAlignmentWindow = 0.5; + //public readonly double? MbrPpmTolerance; + /// + /// Specifies how the donor peak for MBR is selected. + /// 'S' selects the donor peak associated with the highest scoring PSM + /// 'I' selects the donor peak with the max intensity + /// 'N' selects the donor peak with the most neighboring peaks + /// + public char DonorCriterion { get; init; } + public readonly double DonorQValueThreshold; + public readonly bool RequireMsmsIdInCondition; // settings for the Bayesian protein quantification engine @@ -68,8 +80,9 @@ public class FlashLfqEngine private FlashLfqResults _results; internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; - internal Dictionary> DonorFileToModSeqDict { get; private set; } + internal Dictionary> DonorFileToPeakDict { get; private set; } internal ConcurrentBag DecoyPeaks { get; private set; } + public FlashLfqEngine( List allIdentifications, @@ -97,7 +110,9 @@ public FlashLfqEngine( int mcmcBurninSteps = 1000, bool useSharedPeptidesForProteinQuant = false, bool pairedSamples = false, - int? randomSeed = null) + int? randomSeed = null, + char donorCriterion = 'I', + double donorQValueThreshold = 0.05) { Loaders.LoadElements(); @@ -126,6 +141,9 @@ public FlashLfqEngine( Silent = silent; IdSpecificChargeState = idSpecificChargeState; MbrRtWindow = maxMbrWindow; + DonorCriterion = donorCriterion; + DonorQValueThreshold = donorQValueThreshold; + RequireMsmsIdInCondition = requireMsmsIdInCondition; Normalize = normalize; MaxThreads = maxThreads; @@ -197,6 +215,7 @@ public FlashLfqResults Run() // do MBR if (MatchBetweenRuns) { + Console.WriteLine("Find the best donors for match-between-runs"); FindPeptideDonorFiles(); foreach (var spectraFile in _spectraFileInfo) { @@ -583,7 +602,9 @@ internal RtInfo PredictRetentionTime( } //TODO: Expand range and see what happens - rtRange = Math.Min(rtRange+1, MbrRtWindow+1); + rtRange = Math.Min(rtRange+RtWindowIncrease, MbrRtWindow+RtWindowIncrease); + + //rtRange = Math.Min(rtRange, MbrRtWindow); return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); } @@ -632,47 +653,69 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie /// /// For every MSMS identified peptide, selects one file that will be used as the donor /// by finding files that contain the most peaks in the local neighborhood, - /// then writes the restults to the DonorFileToIdsDict + /// then writes the restults to the DonorFileToIdsDict. + /// WARNING! Strong assumption that this is called BEFORE MBR peaks are identified/assigned to the results /// private void FindPeptideDonorFiles() { - DonorFileToModSeqDict = new Dictionary>(); + DonorFileToPeakDict = new Dictionary>(); + + Dictionary> seqPeakDict = new(); + seqPeakDict = _results.Peaks.SelectMany(kvp => kvp.Value) + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < 0.05) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); // iterate through each unique sequence - foreach (string modSeq in _modifiedSequenceToIsotopicDistribution.Keys) + foreach(var sequencePeakListKvp in seqPeakDict) { - SpectraFileInfo bestDonor = null; - int bestDonorNeighborPeakCount = 0; - foreach(SpectraFileInfo file in _spectraFileInfo) - { - var peaksForPeptide = _results.Peaks[file].Where(peak => - peak.Identifications.Any(id => id.ModifiedSequence.Equals(modSeq))); - int neighboringPeakCountMax = 0; - foreach(var donorPeak in peaksForPeptide) - { - // Count the number of neighboring peaks with unique peptides - int neighboringPeaksCount = _results.Peaks[file] - .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) - .Select(peak => peak.Identifications.First().ModifiedSequence) - .Distinct() - .Count(); - if (neighboringPeaksCount > neighboringPeakCountMax) - neighboringPeakCountMax = neighboringPeaksCount; - } + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; - if(neighboringPeakCountMax > bestDonorNeighborPeakCount) - { - bestDonor = file; - bestDonorNeighborPeakCount = neighboringPeakCountMax; - } + ChromatographicPeak bestPeak = null; + switch (DonorCriterion) + { + case 'S': // Select best peak by the PSM score + bestPeak = peaksForPeptide.MaxBy(peak => peak.Identifications.First().PsmScore); + if (bestPeak.Identifications.First().PsmScore > 0) + break; + else // if every ID has a score of zero, let it fall through to the default case + goto default; + case 'N': // Select peak with the most neighboring peaks + int maxPeaks = 0; + foreach (var donorPeak in peaksForPeptide) + { + // Count the number of neighboring peaks with unique peptides + int neighboringPeaksCount = _results.Peaks[donorPeak.SpectraFileInfo] + .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) + .Select(peak => peak.Identifications.First().ModifiedSequence) + .Distinct() + .Count(); + + if (neighboringPeaksCount > maxPeaks) + { + maxPeaks = neighboringPeaksCount; + bestPeak = donorPeak; + } + } + break; + case 'I': // Select the peak with the highest intensity + default: + bestPeak = peaksForPeptide.MaxBy(peak => peak.Intensity); + break; } - if(DonorFileToModSeqDict.ContainsKey(bestDonor)) + + if (bestPeak == null) continue; + if (DonorFileToPeakDict.ContainsKey(bestPeak.SpectraFileInfo)) { - DonorFileToModSeqDict[bestDonor].Add(modSeq); + DonorFileToPeakDict[bestPeak.SpectraFileInfo].Add(bestPeak); } else { - DonorFileToModSeqDict.Add(bestDonor, new List { modSeq }); + DonorFileToPeakDict.Add(bestPeak.SpectraFileInfo, new List { bestPeak }); } } } @@ -686,17 +729,6 @@ private void FindPeptideDonorFiles() /// private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { - // This was just to test the DonorFileToModSeqDict is constructed correctly - - //List < HashSet > seqSets = new(); - //foreach(var listofSeqs in DonorFileToModSeqDict.Values) - //{ - // seqSets.Add(new HashSet(listofSeqs)); - //} - //var interesect = seqSets[0].Intersect(seqSets[1]).Intersect(seqSets[2]); - //var onetTwo = seqSets[0].Intersect(seqSets[1]); - //var twoThre = seqSets[2].Intersect(seqSets[1]); - //var oneThree = seqSets[0].Intersect(seqSets[2]); bool acceptorSampleIsFractionated = _results.SpectraFiles .Where(p => p.Condition == idAcceptorFile.Condition && p.BiologicalReplicate == idAcceptorFile.BiologicalReplicate) @@ -709,14 +741,14 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // these are the analytes already identified in this run. we don't need to try to match them from other runs var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks - .Where(p => p.IsotopicEnvelopes.Any()) + .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); if (scorer == null) return; - //mbrTol = new PpmTolerance(50); + mbrTol = new PpmTolerance(MbrPpmTolerance); // deserialize the file's indexed mass spectral peaks. these were stored and serialized to a file earlier _peakIndexingEngine.DeserializeIndex(idAcceptorFile); @@ -737,35 +769,19 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // this stores the results of MBR ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); Random randomGenerator = new Random(); - //ConcurrentDictionary acceptorPeakDecoyPeakDict = new(); - - - // New Approach - one donor file per peptide // map each donor file onto this file - foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) + foreach (var donorFilePeakListKvp in DonorFileToPeakDict) { - if (idAcceptorFile.Equals(idDonorFile)) - { - continue; - } - if (!DonorFileToModSeqDict.TryGetValue(idDonorFile, out var sequencesInDonor)) - { - continue; - } - else if (!sequencesInDonor.Any()) + if (idAcceptorFile.Equals(donorFilePeakListKvp.Key)) { continue; } // this is the list of peaks identified in the other file but not in this one ("ID donor peaks") - List idDonorPeaks = _results.Peaks[idDonorFile].Where(p => - !p.IsMbrPeak - && p.NumIdentificationsByFullSeq == 1 - && p.IsotopicEnvelopes.Any() - && sequencesInDonor.Contains(p.Identifications.First().ModifiedSequence) - && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) - && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); + List idDonorPeaks = donorFilePeakListKvp.Value + .Where(p => !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) + && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); if (!idDonorPeaks.Any()) { @@ -773,7 +789,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } bool donorSampleIsFractionated = _results.SpectraFiles - .Where(p => p.Condition == idDonorFile.Condition && p.BiologicalReplicate == idDonorFile.BiologicalReplicate) + .Where(p => p.Condition == donorFilePeakListKvp.Key.Condition && p.BiologicalReplicate == donorFilePeakListKvp.Key.BiologicalReplicate) .Select(p => p.Fraction) .Distinct() .Count() > 1; @@ -781,13 +797,13 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // We're only interested in the fold change if the conditions are different. Otherwise, we score based off of the intensities // of the acceptor file if (_spectraFileInfo.Select(p => p.Condition).Distinct().Count() > 1 - && idDonorFile.Condition != idAcceptorFile.Condition) + && donorFilePeakListKvp.Key.Condition != idAcceptorFile.Condition) { scorer.CalculateFoldChangeBetweenFiles(idDonorPeaks); } // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile); + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile); // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), @@ -829,28 +845,10 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { DecoyPeaks.Add(bestDecoy); } - //acceptorPeakDecoyPeakDict.TryAdd(donorPeak.Identifications.First(), (bestAcceptor, bestDecoy)); } }); } - //var test = acceptorPeakDecoyPeakDict.Where(kvp => kvp.Value.decoy != null).ToList(); - //if(test.Count > 100) - //{ - // using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrTargetDecoy_WideTolerance_wideRt.tsv")) - // { - // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - // foreach (var pair in test) - // { - // if (pair.Value.target != null) - // writer.WriteLine(pair.Value.target.ToString()); - // else - // writer.WriteLine(""); - // writer.WriteLine(pair.Value.decoy.ToString()); - // } - // } - //} - // If we have multiple identification with the same sequence mapped to the same peak, we want to sum their MBR scores // This is done here foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) @@ -860,25 +858,17 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) { - List collapsedPeaks = new(); - foreach(var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + List bestPeaks = new(); + foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) { - //var scoreSum = peakGroup.Sum(peak => peak.MbrScore); - var idList = peakGroup.Select(peak => peak.Identifications.First()).Distinct().ToList(); // This is fine, because each mbrPeak only has one identification - var collapsedPeak = peakGroup.OrderBy(peak => peak.MbrScore).First(); - //collapsedPeak.MbrScore = peakGroup.Max(peak => peak.MbrScore); - // Lmao, these three lines are problematic. Should probably do something about them - collapsedPeak.Identifications.Clear(); - collapsedPeak.Identifications.AddRange(idList); - collapsedPeak.ResolveIdentifications(); - collapsedPeaks.Add(collapsedPeak); + bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); } envelopePeakListKvp.Value.Clear(); - envelopePeakListKvp.Value.AddRange(collapsedPeaks); + envelopePeakListKvp.Value.AddRange(bestPeaks); } } - + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) @@ -912,18 +902,8 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } _results.Peaks[idAcceptorFile].Add(best); - } - //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\MbrAllMbrPeaks.tsv")) - //{ - // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - // foreach (var peak in _results.Peaks[idAcceptorFile].Where(peak => peak.IsMbrPeak)) - // { - // writer.WriteLine(peak.ToString()); - // } - //} - RunErrorChecking(idAcceptorFile); } @@ -976,7 +956,7 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - double rtVariance = Math.Min((rtInfo.Width - 1) / 6, 0.05); // Minimum standard deviation of 3 seconds + double rtVariance = Math.Min(rtInfo.Width - RtWindowIncrease / 6, 0.05); // Minimum standard deviation of 3 seconds Normal rtScoringDistribution = randomRt == null ? new Normal(rtInfo.PredictedRt, rtVariance) : new Normal((double)randomRt, rtVariance); bestAcceptor = null; @@ -1004,9 +984,11 @@ internal void FindAllAcceptorPeaks( if (acceptorPeak == null) continue; if (bestAcceptor == null || bestAcceptor.MbrScore < acceptorPeak.MbrScore) + { + acceptorPeak.ChargeList = chargesToMatch; bestAcceptor = acceptorPeak; - //if (decoyRt != null) - // continue; // We don't want to store the decoys in mbrIdentifiedPeaks right now + } + // save the peak hypothesis matchBetweenRunsIdentifiedPeaks.AddOrUpdate @@ -1165,6 +1147,7 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) // merge duplicate peaks and handle MBR/MSMS peakfinding conflicts var errorCheckedPeaksGroupedByApex = new Dictionary(); var errorCheckedPeaks = new List(); + List decoyPeptidePeaks = new(); foreach (ChromatographicPeak tryPeak in _results.Peaks[spectraFile].OrderBy(p => p.IsMbrPeak)) { tryPeak.CalculateIntensityForThisFeature(Integrate); @@ -1194,6 +1177,10 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { + if(tryPeak.DecoyPeptide) + { + decoyPeptidePeaks.Add(tryPeak); + } continue; // Default to MSMS peaks over MBR Peaks } else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak) @@ -1236,6 +1223,8 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } errorCheckedPeaks.AddRange(errorCheckedPeaksGroupedByApex.Values.Where(p => p != null)); + errorCheckedPeaks.AddRange(decoyPeptidePeaks); + _results.Peaks[spectraFile] = errorCheckedPeaks; } diff --git a/mzLib/FlashLFQ/Identification.cs b/mzLib/FlashLFQ/Identification.cs index 6395000bf..4abee4105 100644 --- a/mzLib/FlashLFQ/Identification.cs +++ b/mzLib/FlashLFQ/Identification.cs @@ -16,13 +16,15 @@ public class Identification public readonly bool UseForProteinQuant; public double PeakfindingMass; public double PosteriorErrorProbability; + public double PsmScore { get; init; } + public double QValue { get; init; } public bool IsDecoy { get; } public Identification(SpectraFileInfo fileInfo, string BaseSequence, string ModifiedSequence, double monoisotopicMass, double ms2RetentionTimeInMinutes, int chargeState, List proteinGroups, ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true, double posteriorErrorProbability = 0, - bool decoy = false) + double psmScore = 0, double qValue = 0, bool decoy = false) { this.FileInfo = fileInfo; this.BaseSequence = BaseSequence; @@ -34,6 +36,8 @@ public Identification(SpectraFileInfo fileInfo, string BaseSequence, string Modi this.OptionalChemicalFormula = optionalChemicalFormula; UseForProteinQuant = useForProteinQuant; PosteriorErrorProbability = posteriorErrorProbability; + QValue = qValue; + PsmScore = psmScore; IsDecoy = decoy; } diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index a210599a0..322caf52c 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -93,13 +93,14 @@ public static void TwoFileMbrTest() } bool isDecoy = split[32] == "Y"; + double score = double.TryParse(split[9], out var s) ? s : 0; - Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy); + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy, psmScore: score); ids.Add(id); } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5, donorCriterion: 'S'); var results = engine.Run(); var test = results.Peaks.Values.SelectMany(peakList => peakList).ToList(); @@ -113,7 +114,7 @@ public static void TwoFileMbrTest() mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_orgTest.tsv")) + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_1PepTest.tsv")) { writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); foreach (var peak in mbrPeaks) @@ -280,13 +281,14 @@ public static void ThreeFileMbrTest() } bool isDecoy = split[32] == "Y"; + double score = double.TryParse(split[9], out var s) ? s : 0; - Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy); + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy, psmScore: score); ids.Add(id); } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5, donorCriterion: 'S'); var results = engine.Run(); var test = results.Peaks.Values.SelectMany(peakList => peakList).ToList(); @@ -300,7 +302,7 @@ public static void ThreeFileMbrTest() mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_minRtDiff.tsv")) + using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_1PeakPerPepScore.tsv")) { writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); foreach (var peak in mbrPeaks) @@ -309,14 +311,14 @@ public static void ThreeFileMbrTest() } } - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) - { - writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - foreach (var peak in engine.DecoyPeaks) - { - writer.WriteLine(peak); - } - } + //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) + //{ + // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + // foreach (var peak in engine.DecoyPeaks) + // { + // writer.WriteLine(peak); + // } + //} var f1r1MbrResults = results .PeptideModifiedSequences diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index afe4bf50c..6fd88aa54 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,8 +1,8 @@ + 5.2.9.6 mzLib - 5.1.3 mzLib Stef S. Stef S. From 363c457288b8ba5bd3d531041965d1cf321324d5 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 19 Feb 2024 10:44:34 -0600 Subject: [PATCH 22/55] Minor changes to peakfinding --- mzLib/FlashLFQ/FlashLfqEngine.cs | 14 ++++++++------ mzLib/mzLib.nuspec | 2 +- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index ad798f4f2..f8f8f58a1 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -664,7 +664,7 @@ private void FindPeptideDonorFiles() seqPeakDict = _results.Peaks.SelectMany(kvp => kvp.Value) .Where(peak => peak.NumIdentificationsByFullSeq == 1 && peak.IsotopicEnvelopes.Any() - && peak.Identifications.Min(id => id.QValue) < 0.05) + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) .GroupBy(peak => peak.Identifications.First().ModifiedSequence) .ToDictionary(group => group.Key, group => group.ToList()); @@ -901,6 +901,8 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } + if (best.Identifications.First().QValue >= 0.01 && !best.DecoyPeptide && !best.RandomRt) + continue; // don't accept MBR peaks based on low q ids, do accept decoys though _results.Peaks[idAcceptorFile].Add(best); } @@ -973,7 +975,7 @@ internal void FindAllAcceptorPeaks( if (!chargeXic.Any()) continue; - List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z); + List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z).OrderBy(env => env.Intensity).ToList(); // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. // remove the clustered isotopic envelopes from the list of seeds after each iteration @@ -1038,10 +1040,11 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( List chargeEnvelopes, bool randomRt = false) { - var donorId = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + var donorId = donorPeak.Identifications.OrderBy(p => p.QValue).First(); var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt); - IsotopicEnvelope seedEnv = chargeEnvelopes.First(); + // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list + IsotopicEnvelope seedEnv = chargeEnvelopes.First(); var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorId.PeakfindingMass, z, idAcceptorFile, mbrTol); List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorId, z); acceptorPeak.IsotopicEnvelopes.AddRange(bestChargeEnvelopes); @@ -1223,8 +1226,7 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } errorCheckedPeaks.AddRange(errorCheckedPeaksGroupedByApex.Values.Where(p => p != null)); - errorCheckedPeaks.AddRange(decoyPeptidePeaks); - + //errorCheckedPeaks.AddRange(decoyPeptidePeaks); _results.Peaks[spectraFile] = errorCheckedPeaks; } diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 6fd88aa54..abdacc63e 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.2.9.6 + 5.3.0.0 mzLib mzLib Stef S. From c4720ba383bce15b6e3ea7cab2eedf7ee8698855 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 29 Feb 2024 12:10:03 -0600 Subject: [PATCH 23/55] MSMS double checking --- mzLib/FlashLFQ/FlashLFQResults.cs | 18 + mzLib/FlashLFQ/FlashLfqEngine.cs | 392 ++++++++++++++-------- mzLib/FlashLFQ/IndexedMassSpectralPeak.cs | 1 + mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 5 + mzLib/mzLib.nuspec | 2 +- 5 files changed, 284 insertions(+), 134 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index c6218bc81..e6ae4d5aa 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -14,6 +14,7 @@ public class FlashLfqResults public readonly Dictionary PeptideModifiedSequences; public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; + public readonly Dictionary> DoubleCheckedPeaks; public IEnumerable DecoyPeaks { get; set; } public FlashLfqResults(List spectraFiles, List identifications) @@ -22,10 +23,12 @@ public FlashLfqResults(List spectraFiles, List PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); + DoubleCheckedPeaks = new(); foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); + DoubleCheckedPeaks.Add(file, new List()); } foreach (Identification id in identifications) @@ -561,6 +564,21 @@ public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, st output.WriteLine(peak.ToString()); } } + + string[] pathSplit = peaksOutputPath.Split(Path.DirectorySeparatorChar); + pathSplit[^1] = "DoubleCheckedPeaks.tsv"; + + using (var output = new StreamWriter(String.Join(Path.DirectorySeparatorChar, pathSplit))) + { + output.WriteLine(ChromatographicPeak.TabSeparatedHeader); + + foreach (var peak in DoubleCheckedPeaks.SelectMany(p => p.Value) + .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) + .ThenByDescending(p => p.Collision)) + { + output.WriteLine(peak.ToString()); + } + } } if(decoyPath != null & DecoyPeaks.IsNotNullOrEmpty()) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index f8f8f58a1..97e64cfc6 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -12,6 +12,7 @@ using UsefulProteomicsDatabases; using System.Runtime.CompilerServices; using System.IO; +using Easy.Common.Extensions; [assembly: InternalsVisibleTo("TestFlashLFQ")] @@ -291,6 +292,7 @@ public PeakIndexingEngine GetIndexingEngine() { return _peakIndexingEngine; } + /// /// Creates a theoretical isotope distribution for each of the identified sequences /// If the sequence is modified and the modification has an unknown chemical formula, @@ -500,6 +502,136 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) _results.Peaks[fileInfo].AddRange(chromatographicPeaks.ToList()); } + /// + /// Used by the match-between-runs algorithm to determine systematic retention time drifts between + /// chromatographic runs. + /// + private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor) + { + var donorFileBestMsmsPeaks = new Dictionary(); + var acceptorFileBestMsmsPeaks = new Dictionary(); + var rtCalibrationCurve = new List(); + + // get all peaks, not counting ambiguous peaks + IEnumerable donorPeaks = _results.Peaks[donor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); + IEnumerable acceptorPeaks = _results.Peaks[acceptor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); + + // get the best (most intense) peak for each peptide in the acceptor file + foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) + { + if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) + { + if (currentBestPeak.Intensity > acceptorPeak.Intensity) + { + acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; + } + } + else + { + acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); + } + } + + // get the best (most intense) peak for each peptide in the donor file + foreach (ChromatographicPeak donorPeak in donorPeaks) + { + if (donorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) + { + if (currentBestPeak.Intensity > donorPeak.Intensity) + { + donorFileBestMsmsPeaks[donorPeak.Identifications.First().ModifiedSequence] = donorPeak; + } + } + else + { + donorFileBestMsmsPeaks.Add(donorPeak.Identifications.First().ModifiedSequence, donorPeak); + } + } + + // create RT calibration curve + foreach (var peak in acceptorFileBestMsmsPeaks) + { + ChromatographicPeak acceptorFilePeak = peak.Value; + + if (donorFileBestMsmsPeaks.TryGetValue(peak.Key, out ChromatographicPeak donorFilePeak)) + { + rtCalibrationCurve.Add(new RetentionTimeCalibDataPoint(donorFilePeak, acceptorFilePeak)); + } + } + + return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); + } + + /// + /// For every MSMS identified peptide, selects one file that will be used as the donor + /// by finding files that contain the most peaks in the local neighborhood, + /// then writes the restults to the DonorFileToIdsDict. + /// WARNING! Strong assumption that this is called BEFORE MBR peaks are identified/assigned to the results + /// + private void FindPeptideDonorFiles() + { + DonorFileToPeakDict = new Dictionary>(); + + Dictionary> seqPeakDict = new(); + seqPeakDict = _results.Peaks.SelectMany(kvp => kvp.Value) + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each unique sequence + foreach (var sequencePeakListKvp in seqPeakDict) + { + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = null; + switch (DonorCriterion) + { + case 'S': // Select best peak by the PSM score + bestPeak = peaksForPeptide.MaxBy(peak => peak.Identifications.First().PsmScore); + if (bestPeak.Identifications.First().PsmScore > 0) + break; + else // if every ID has a score of zero, let it fall through to the default case + goto default; + case 'N': // Select peak with the most neighboring peaks + int maxPeaks = 0; + foreach (var donorPeak in peaksForPeptide) + { + // Count the number of neighboring peaks with unique peptides + int neighboringPeaksCount = _results.Peaks[donorPeak.SpectraFileInfo] + .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) + .Select(peak => peak.Identifications.First().ModifiedSequence) + .Distinct() + .Count(); + + if (neighboringPeaksCount > maxPeaks) + { + maxPeaks = neighboringPeaksCount; + bestPeak = donorPeak; + } + } + break; + case 'I': // Select the peak with the highest intensity + default: + bestPeak = peaksForPeptide.MaxBy(peak => peak.Intensity); + break; + } + + if (bestPeak == null) continue; + if (DonorFileToPeakDict.ContainsKey(bestPeak.SpectraFileInfo)) + { + DonorFileToPeakDict[bestPeak.SpectraFileInfo].Add(bestPeak); + } + else + { + DonorFileToPeakDict.Add(bestPeak.SpectraFileInfo, new List { bestPeak }); + } + } + } + /// /// Used by MBR. Predicts the retention time of a peak in an acceptor file based on the /// retention time of the peak in the donor file. This is done with a local alignment @@ -650,76 +782,6 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, logIntensityDistribution); } - /// - /// For every MSMS identified peptide, selects one file that will be used as the donor - /// by finding files that contain the most peaks in the local neighborhood, - /// then writes the restults to the DonorFileToIdsDict. - /// WARNING! Strong assumption that this is called BEFORE MBR peaks are identified/assigned to the results - /// - private void FindPeptideDonorFiles() - { - DonorFileToPeakDict = new Dictionary>(); - - Dictionary> seqPeakDict = new(); - seqPeakDict = _results.Peaks.SelectMany(kvp => kvp.Value) - .Where(peak => peak.NumIdentificationsByFullSeq == 1 - && peak.IsotopicEnvelopes.Any() - && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) - .GroupBy(peak => peak.Identifications.First().ModifiedSequence) - .ToDictionary(group => group.Key, group => group.ToList()); - - // iterate through each unique sequence - foreach(var sequencePeakListKvp in seqPeakDict) - { - List peaksForPeptide = sequencePeakListKvp.Value; - if (!peaksForPeptide.Any()) - continue; - - ChromatographicPeak bestPeak = null; - switch (DonorCriterion) - { - case 'S': // Select best peak by the PSM score - bestPeak = peaksForPeptide.MaxBy(peak => peak.Identifications.First().PsmScore); - if (bestPeak.Identifications.First().PsmScore > 0) - break; - else // if every ID has a score of zero, let it fall through to the default case - goto default; - case 'N': // Select peak with the most neighboring peaks - int maxPeaks = 0; - foreach (var donorPeak in peaksForPeptide) - { - // Count the number of neighboring peaks with unique peptides - int neighboringPeaksCount = _results.Peaks[donorPeak.SpectraFileInfo] - .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) - .Select(peak => peak.Identifications.First().ModifiedSequence) - .Distinct() - .Count(); - - if (neighboringPeaksCount > maxPeaks) - { - maxPeaks = neighboringPeaksCount; - bestPeak = donorPeak; - } - } - break; - case 'I': // Select the peak with the highest intensity - default: - bestPeak = peaksForPeptide.MaxBy(peak => peak.Intensity); - break; - } - - if (bestPeak == null) continue; - if (DonorFileToPeakDict.ContainsKey(bestPeak.SpectraFileInfo)) - { - DonorFileToPeakDict[bestPeak.SpectraFileInfo].Add(bestPeak); - } - else - { - DonorFileToPeakDict.Add(bestPeak.SpectraFileInfo, new List { bestPeak }); - } - } - } - /// /// This method maps identified peaks from other chromatographic runs ("donors") onto /// the defined chromatographic run ("acceptor"). The goal is to reduce the number of missing @@ -770,6 +832,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); Random randomGenerator = new Random(); + // This stores the results of a check where we examine whether MBR can return the same peak as the MSMS peak + ConcurrentDictionary>> doubleCheckPeaks = new(); + // map each donor file onto this file foreach (var donorFilePeakListKvp in DonorFileToPeakDict) { @@ -847,14 +912,43 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } }); + + // List of donor peaks where the peptide WAS identified in the acceptor file and the best donor is a different file. + // This will be used for checking the error rate + // For each sequence, we only select one peak corresponding to the PSM with the lowest q-value + List donorPeaksWithMsms = donorFilePeakListKvp.Value + .Where(p => acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) + && p.SpectraFileInfo != idAcceptorFile + && !p.Identifications.First().IsDecoy ) + .ToList(); + + // Loop through every MSMS id in the donor file + Parallel.ForEach(Partitioner.Create(0, donorPeaksWithMsms.Count), + new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, + (range, loopState) => + { + + for (int i = range.Item1; i < range.Item2; i++) + { + ChromatographicPeak donorPeak = donorPeaksWithMsms[i]; + // TODO: Add a toggle that set rtRange to be maximum width + RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + if (rtInfo == null) continue; + + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, doubleCheckPeaks, out var bestAcceptor); + if(bestAcceptor == null) + { + doubleCheckPeaks.TryAdd( + key: donorPeak.Identifications.First().ModifiedSequence, + value: null); + } + } + }); } - // If we have multiple identification with the same sequence mapped to the same peak, we want to sum their MBR scores - // This is done here foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) { // Each isotopic envelope is linked to a list of ChromatographicPeaks - // If multiple chromatographic peaks are linked, each with the same peptide identification, then their mbr scores are summed // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) { @@ -868,8 +962,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } - - // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -906,6 +998,98 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) _results.Peaks[idAcceptorFile].Add(best); } + // repeat the above procedures for the doubleCheckPeaks + foreach (var seqDictionaryKvp in doubleCheckPeaks.Where(kvp => kvp.Value != null)) + { + // Each isotopic envelope is linked to a list of ChromatographicPeaks + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. + foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) + { + List bestPeaks = new(); + foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + { + bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); + } + envelopePeakListKvp.Value.Clear(); + envelopePeakListKvp.Value.AddRange(bestPeaks); + } + } + + // take the best result (highest scoring) for each peptide after we've matched from all the donor files + foreach (var mbrIdentifiedPeptide in doubleCheckPeaks.Where(p => acceptorFileIdentifiedSequences.Contains(p.Key))) + { + string peptideModifiedSequence = mbrIdentifiedPeptide.Key; + List msmsPeaks = acceptorFileIdentifiedPeaks + .Where(peak => peak.Apex != null && peak.Identifications.First().ModifiedSequence.Equals(peptideModifiedSequence)).ToList(); + if (!msmsPeaks.Any()) continue; //This shouldn't happen + if (mbrIdentifiedPeptide.Value == null) + { + ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); + nullPeak.Collision = "0"; // Zero represent peak not found + _results.DoubleCheckedPeaks[idAcceptorFile].Add(nullPeak); + continue; + } + + List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); + ChromatographicPeak best = peakHypotheses.First(); + peakHypotheses.Remove(best); + + if (peakHypotheses.Count > 0) + { + double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); + double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); + + List peaksToRemoveFromHypotheses = new List(); + foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) + { + if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) + { + best.MergeFeatureWith(peak, Integrate); + + peaksToRemoveFromHypotheses.Add(peak); + } + } + } + + if(best == null || best.Apex == null) + { + ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); + nullPeak.Collision = "0"; // Zero represent peak not found + _results.DoubleCheckedPeaks[idAcceptorFile].Add(nullPeak); + continue; + } + + if (msmsPeaks.Any(peak => peak.Apex.Equals(best.Apex))) + { + best.Collision = "1"; // One is best possible + } + else + { + var test = msmsPeaks.Where(peak => Math.Abs(peak.Apex.IndexedPeak.RetentionTime - best.Apex.IndexedPeak.RetentionTime) < 0.0001); + if(test.IsNotNullOrEmpty()) + { + best.Collision = "2"; // Assumed same time, different charge state + } + else + { + test = msmsPeaks.Where(peak => + { + var rts = peak.IsotopicEnvelopes.Select(e => e.IndexedPeak.RetentionTime); + return best.ApexRetentionTime >= rts.Minimum() && best.ApexRetentionTime <= rts.Maximum(); + }); + if (test.IsNotNullOrEmpty()) + { + best.Collision = "3"; // Overlap peak + } + else + { + best.Collision = "-1"; + } + } + } + _results.DoubleCheckedPeaks[idAcceptorFile].Add(best); + } + RunErrorChecking(idAcceptorFile); } @@ -1072,65 +1256,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return acceptorPeak; } - /// - /// Used by the match-between-runs algorithm to determine systematic retention time drifts between - /// chromatographic runs. - /// - private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor) - { - var donorFileBestMsmsPeaks = new Dictionary(); - var acceptorFileBestMsmsPeaks = new Dictionary(); - var rtCalibrationCurve = new List(); - - // get all peaks, not counting ambiguous peaks - IEnumerable donorPeaks = _results.Peaks[donor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - IEnumerable acceptorPeaks = _results.Peaks[acceptor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - - // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) - { - if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > acceptorPeak.Intensity) - { - acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; - } - } - else - { - acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); - } - } - - // get the best (most intense) peak for each peptide in the donor file - foreach (ChromatographicPeak donorPeak in donorPeaks) - { - if (donorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > donorPeak.Intensity) - { - donorFileBestMsmsPeaks[donorPeak.Identifications.First().ModifiedSequence] = donorPeak; - } - } - else - { - donorFileBestMsmsPeaks.Add(donorPeak.Identifications.First().ModifiedSequence, donorPeak); - } - } - - // create RT calibration curve - foreach (var peak in acceptorFileBestMsmsPeaks) - { - ChromatographicPeak acceptorFilePeak = peak.Value; - - if (donorFileBestMsmsPeaks.TryGetValue(peak.Key, out ChromatographicPeak donorFilePeak)) - { - rtCalibrationCurve.Add(new RetentionTimeCalibDataPoint(donorFilePeak, acceptorFilePeak)); - } - } - return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); - } /// /// Checks for and resolves situations where one IndexedMassSpectralPeak is defined as the apex diff --git a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs index c9aa89042..eb76d54c4 100644 --- a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs +++ b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs @@ -23,6 +23,7 @@ public override bool Equals(object obj) var otherPeak = (IndexedMassSpectralPeak)obj; return otherPeak != null + //&& Math.Abs(otherPeak.Mz - this.Mz) < 0.00000001 && otherPeak.Mz == this.Mz && otherPeak.ZeroBasedMs1ScanIndex == this.ZeroBasedMs1ScanIndex; } diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index 322caf52c..13a1e3962 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -132,12 +132,17 @@ public static void TwoFileMbrTest() // } //} + var collisionDict = results.DoubleCheckedPeaks.SelectMany(kvp => kvp.Value).GroupBy(peak => peak.Collision) + .ToDictionary(g => g.Key, g => g.Count()); + var f1r1MbrResults = results .PeptideModifiedSequences .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); Assert.That(f1r1MbrResults.Count >= 132); + results.WriteResults(peaksOutputPath: @"C:\Users\Alex\Desktop\FlashTest\AllPeaks.tsv", null, null, null, true); + var f1r2MbrResults = results.PeptideModifiedSequences .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MSMS && p.Value.GetDetectionType(j6) == DetectionType.MBR).ToList(); diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index abdacc63e..ae12c509d 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.0.0 + 5.3.0.7 mzLib mzLib Stef S. From 9ade14a2532b81f9b1d17aa6abf87388b5df741d Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 29 Feb 2024 12:14:09 -0600 Subject: [PATCH 24/55] Removed msms double checking --- mzLib/FlashLFQ/FlashLFQResults.cs | 18 ---- mzLib/FlashLFQ/FlashLfqEngine.cs | 128 +---------------------- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 3 - 3 files changed, 2 insertions(+), 147 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index e6ae4d5aa..c6218bc81 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -14,7 +14,6 @@ public class FlashLfqResults public readonly Dictionary PeptideModifiedSequences; public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; - public readonly Dictionary> DoubleCheckedPeaks; public IEnumerable DecoyPeaks { get; set; } public FlashLfqResults(List spectraFiles, List identifications) @@ -23,12 +22,10 @@ public FlashLfqResults(List spectraFiles, List PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); - DoubleCheckedPeaks = new(); foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); - DoubleCheckedPeaks.Add(file, new List()); } foreach (Identification id in identifications) @@ -564,21 +561,6 @@ public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, st output.WriteLine(peak.ToString()); } } - - string[] pathSplit = peaksOutputPath.Split(Path.DirectorySeparatorChar); - pathSplit[^1] = "DoubleCheckedPeaks.tsv"; - - using (var output = new StreamWriter(String.Join(Path.DirectorySeparatorChar, pathSplit))) - { - output.WriteLine(ChromatographicPeak.TabSeparatedHeader); - - foreach (var peak in DoubleCheckedPeaks.SelectMany(p => p.Value) - .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) - .ThenByDescending(p => p.Collision)) - { - output.WriteLine(peak.ToString()); - } - } } if(decoyPath != null & DecoyPeaks.IsNotNullOrEmpty()) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 97e64cfc6..719766364 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -912,38 +912,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } }); - - // List of donor peaks where the peptide WAS identified in the acceptor file and the best donor is a different file. - // This will be used for checking the error rate - // For each sequence, we only select one peak corresponding to the PSM with the lowest q-value - List donorPeaksWithMsms = donorFilePeakListKvp.Value - .Where(p => acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) - && p.SpectraFileInfo != idAcceptorFile - && !p.Identifications.First().IsDecoy ) - .ToList(); - - // Loop through every MSMS id in the donor file - Parallel.ForEach(Partitioner.Create(0, donorPeaksWithMsms.Count), - new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, - (range, loopState) => - { - - for (int i = range.Item1; i < range.Item2; i++) - { - ChromatographicPeak donorPeak = donorPeaksWithMsms[i]; - // TODO: Add a toggle that set rtRange to be maximum width - RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); - if (rtInfo == null) continue; - - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, doubleCheckPeaks, out var bestAcceptor); - if(bestAcceptor == null) - { - doubleCheckPeaks.TryAdd( - key: donorPeak.Identifications.First().ModifiedSequence, - value: null); - } - } - }); } foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) @@ -993,103 +961,11 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } - if (best.Identifications.First().QValue >= 0.01 && !best.DecoyPeptide && !best.RandomRt) - continue; // don't accept MBR peaks based on low q ids, do accept decoys though + //if (best.Identifications.First().QValue >= 0.01 && !best.DecoyPeptide && !best.RandomRt) + // continue; // don't accept MBR peaks based on low q ids, do accept decoys though _results.Peaks[idAcceptorFile].Add(best); } - // repeat the above procedures for the doubleCheckPeaks - foreach (var seqDictionaryKvp in doubleCheckPeaks.Where(kvp => kvp.Value != null)) - { - // Each isotopic envelope is linked to a list of ChromatographicPeaks - // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. - foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) - { - List bestPeaks = new(); - foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) - { - bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); - } - envelopePeakListKvp.Value.Clear(); - envelopePeakListKvp.Value.AddRange(bestPeaks); - } - } - - // take the best result (highest scoring) for each peptide after we've matched from all the donor files - foreach (var mbrIdentifiedPeptide in doubleCheckPeaks.Where(p => acceptorFileIdentifiedSequences.Contains(p.Key))) - { - string peptideModifiedSequence = mbrIdentifiedPeptide.Key; - List msmsPeaks = acceptorFileIdentifiedPeaks - .Where(peak => peak.Apex != null && peak.Identifications.First().ModifiedSequence.Equals(peptideModifiedSequence)).ToList(); - if (!msmsPeaks.Any()) continue; //This shouldn't happen - if (mbrIdentifiedPeptide.Value == null) - { - ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); - nullPeak.Collision = "0"; // Zero represent peak not found - _results.DoubleCheckedPeaks[idAcceptorFile].Add(nullPeak); - continue; - } - - List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); - ChromatographicPeak best = peakHypotheses.First(); - peakHypotheses.Remove(best); - - if (peakHypotheses.Count > 0) - { - double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); - double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); - - List peaksToRemoveFromHypotheses = new List(); - foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) - { - if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) - { - best.MergeFeatureWith(peak, Integrate); - - peaksToRemoveFromHypotheses.Add(peak); - } - } - } - - if(best == null || best.Apex == null) - { - ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); - nullPeak.Collision = "0"; // Zero represent peak not found - _results.DoubleCheckedPeaks[idAcceptorFile].Add(nullPeak); - continue; - } - - if (msmsPeaks.Any(peak => peak.Apex.Equals(best.Apex))) - { - best.Collision = "1"; // One is best possible - } - else - { - var test = msmsPeaks.Where(peak => Math.Abs(peak.Apex.IndexedPeak.RetentionTime - best.Apex.IndexedPeak.RetentionTime) < 0.0001); - if(test.IsNotNullOrEmpty()) - { - best.Collision = "2"; // Assumed same time, different charge state - } - else - { - test = msmsPeaks.Where(peak => - { - var rts = peak.IsotopicEnvelopes.Select(e => e.IndexedPeak.RetentionTime); - return best.ApexRetentionTime >= rts.Minimum() && best.ApexRetentionTime <= rts.Maximum(); - }); - if (test.IsNotNullOrEmpty()) - { - best.Collision = "3"; // Overlap peak - } - else - { - best.Collision = "-1"; - } - } - } - _results.DoubleCheckedPeaks[idAcceptorFile].Add(best); - } - RunErrorChecking(idAcceptorFile); } diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index 13a1e3962..6056ea3c4 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -132,9 +132,6 @@ public static void TwoFileMbrTest() // } //} - var collisionDict = results.DoubleCheckedPeaks.SelectMany(kvp => kvp.Value).GroupBy(peak => peak.Collision) - .ToDictionary(g => g.Key, g => g.Count()); - var f1r1MbrResults = results .PeptideModifiedSequences .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); From e595cd4dc890b0591debd27412e78b04c252af90 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 29 Feb 2024 18:49:17 -0600 Subject: [PATCH 25/55] Add check for MBR/MSMS peak collision --- mzLib/FlashLFQ/FlashLfqEngine.cs | 84 ++++++++++++------------ mzLib/Test/Test.csproj | 5 ++ mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 16 ++--- mzLib/mzLib.nuspec | 2 +- 4 files changed, 57 insertions(+), 50 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 719766364..94a92fad2 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -914,10 +914,13 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) }); } + // Eliminate duplicate peaks (not sure where they come from) foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) { // Each isotopic envelope is linked to a list of ChromatographicPeaks - // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. + // Here, we remove instances where the same envelope is associated with multiple chromatographic peaks but the peaks correspond to the same donor peptide + // I don't know why this happens lol + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications, then they're kept separate. foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) { List bestPeaks = new(); @@ -930,6 +933,12 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } + // Create a dictionary that stores imsPeak associated with an ms/ms identified peptide + Dictionary> msmsImsPeaks = _results.Peaks[idAcceptorFile].Where(peak => peak.Apex?.IndexedPeak != null) + .Select(peak => peak.Apex.IndexedPeak) + .GroupBy(imsPeak => imsPeak.ZeroBasedMs1ScanIndex) + .ToDictionary(g => g.Key, g => g.ToList()); + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -940,29 +949,46 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); - ChromatographicPeak best = peakHypotheses.First(); peakHypotheses.Remove(best); + // Discard any peaks that are already associated with an ms/ms identified peptide + while(best.Apex?.IndexedPeak != null && msmsImsPeaks.TryGetValue(best.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList)) + { + if(peakList.Contains(best.Apex.IndexedPeak)) + { + if(!peakHypotheses.Any()) + { + best = null; + break; + } + best = peakHypotheses.First(); + peakHypotheses.Remove(best); + } + else + { + break; + } + } + if (best == null) continue; + + // merge peaks with different charge states if (peakHypotheses.Count > 0) { double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); - List peaksToRemoveFromHypotheses = new List(); + //List peaksToRemoveFromHypotheses = new List(); foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) { if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) { best.MergeFeatureWith(peak, Integrate); - peaksToRemoveFromHypotheses.Add(peak); + //peaksToRemoveFromHypotheses.Add(peak); } } } - - //if (best.Identifications.First().QValue >= 0.01 && !best.DecoyPeptide && !best.RandomRt) - // continue; // don't accept MBR peaks based on low q ids, do accept decoys though _results.Peaks[idAcceptorFile].Add(best); } @@ -1133,7 +1159,6 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( } - /// /// Checks for and resolves situations where one IndexedMassSpectralPeak is defined as the apex /// for multiple ChromatographicPeaks. In these situations, the two peaks are merged and the merged @@ -1152,7 +1177,7 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) // merge duplicate peaks and handle MBR/MSMS peakfinding conflicts var errorCheckedPeaksGroupedByApex = new Dictionary(); var errorCheckedPeaks = new List(); - List decoyPeptidePeaks = new(); + foreach (ChromatographicPeak tryPeak in _results.Peaks[spectraFile].OrderBy(p => p.IsMbrPeak)) { tryPeak.CalculateIntensityForThisFeature(Integrate); @@ -1172,21 +1197,18 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) IndexedMassSpectralPeak apexImsPeak = tryPeak.Apex.IndexedPeak; if (errorCheckedPeaksGroupedByApex.TryGetValue(apexImsPeak, out ChromatographicPeak storedPeak) && storedPeak != null) { - //if (tryPeak.IsMbrPeak && storedPeak == null) - //{ - // continue; - //} if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { storedPeak.MergeFeatureWith(tryPeak, Integrate); } else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - if(tryPeak.DecoyPeptide) - { - decoyPeptidePeaks.Add(tryPeak); - } - continue; // Default to MSMS peaks over MBR Peaks + // Default to MSMS peaks over MBR Peaks. + // Most of these have already been eliminated + // However, sometimes merging MBR peaks with different charge states reveals that + // The MBR peak conflicts with an MSMS peak + // Removing the peak when this happens is a conservative step. + continue; } else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak) { @@ -1203,36 +1225,16 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) else { errorCheckedPeaksGroupedByApex.Add(apexImsPeak, tryPeak); - - } - } - - foreach(var peak in DecoyPeaks.Where(peak => peak.SpectraFileInfo == spectraFile)) - { - var apexIms = peak.Apex.IndexedPeak; - if(errorCheckedPeaksGroupedByApex.TryGetValue(apexIms, out var collisionPeak)) - { - if(collisionPeak.IsMbrPeak) - { - peak.Collision = "MBR"; - } - else - { - peak.Collision = "MSMS"; - } - } - else - { - peak.Collision = "N/A"; } } errorCheckedPeaks.AddRange(errorCheckedPeaksGroupedByApex.Values.Where(p => p != null)); - //errorCheckedPeaks.AddRange(decoyPeptidePeaks); - + _results.Peaks[spectraFile] = errorCheckedPeaks; } + public int collisionCount = 0; + /// /// Takes in a list of imsPeaks and finds all the isotopic peaks in each scan. If the experimental isotopic distribution /// matches the theoretical distribution, an IsotopicEnvelope object is created from the summed intensities of each isotopic peak. diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index fba08a248..aebc282ad 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -13,10 +13,15 @@ + + runtime; build; native; contentfiles; analyzers; buildtransitive + all + + diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs index 6056ea3c4..a136647d6 100644 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs @@ -304,14 +304,14 @@ public static void ThreeFileMbrTest() mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_1PeakPerPepScore.tsv")) - { - writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - foreach (var peak in mbrPeaks) - { - writer.WriteLine(peak); - } - } + //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_1PeakPerPepScore.tsv")) + //{ + // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); + // foreach (var peak in mbrPeaks) + // { + // writer.WriteLine(peak); + // } + //} //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) //{ diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index ae12c509d..ed665b18a 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.0.7 + 5.3.1.0 mzLib mzLib Stef S. From c88e32e8b79b96d9606c5e49367a07076e46afc9 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 6 Mar 2024 18:49:02 -0600 Subject: [PATCH 26/55] Changed peak decoy selection method --- mzLib/FlashLFQ/FlashLfqEngine.cs | 86 +++++++++++++++++++++++++------- mzLib/mzLib.nuspec | 2 +- 2 files changed, 68 insertions(+), 20 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 94a92fad2..b99a2372b 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -41,8 +41,8 @@ public class FlashLfqEngine public readonly double MbrPpmTolerance; // New MBR Settings - public readonly double RtWindowIncrease = 1.0; - public readonly double MbrAlignmentWindow = 0.5; + public readonly double RtWindowIncrease = 0; + public readonly double MbrAlignmentWindow = 2.5; //public readonly double? MbrPpmTolerance; /// /// Specifies how the donor peak for MBR is selected. @@ -83,6 +83,7 @@ public class FlashLfqEngine internal PeakIndexingEngine _peakIndexingEngine; internal Dictionary> DonorFileToPeakDict { get; private set; } internal ConcurrentBag DecoyPeaks { get; private set; } + internal List PeptidesForMbr { get; init; } public FlashLfqEngine( @@ -113,7 +114,8 @@ public FlashLfqEngine( bool pairedSamples = false, int? randomSeed = null, char donorCriterion = 'I', - double donorQValueThreshold = 0.05) + double donorQValueThreshold = 0.05, + List peptidesForMbr = null) { Loaders.LoadElements(); @@ -134,6 +136,10 @@ public FlashLfqEngine( if(MatchBetweenRuns) { DecoyPeaks = new(); + if(peptidesForMbr != null) + { + PeptidesForMbr = peptidesForMbr; + } } MbrPpmTolerance = matchBetweenRunsPpmTolerance; Integrate = integrate; @@ -580,6 +586,20 @@ private void FindPeptideDonorFiles() .GroupBy(peak => peak.Identifications.First().ModifiedSequence) .ToDictionary(group => group.Key, group => group.ToList()); + if(PeptidesForMbr.IsNotNullOrEmpty()) + { + // remove all donor sequences not in PeptidesForMbr + Dictionary> filteredSeqPeakDict = new(); + foreach(string seq in PeptidesForMbr) + { + if(seqPeakDict.TryGetValue(seq, out var value)) + { + filteredSeqPeakDict.Add(seq, value); + } + } + seqPeakDict = filteredSeqPeakDict; + } + // iterate through each unique sequence foreach (var sequencePeakListKvp in seqPeakDict) { @@ -645,9 +665,10 @@ internal RtInfo PredictRetentionTime( ChromatographicPeak donorPeak, SpectraFileInfo acceptorFile, bool acceptorSampleIsFractionated, - bool donorSampleIsFractionated) + bool donorSampleIsFractionated, + out int rtCalCurveIndex) { - + rtCalCurveIndex = -1; var nearbyCalibrationPoints = new List(); // only compare +- 1 fraction @@ -675,6 +696,8 @@ internal RtInfo PredictRetentionTime( index = rtCalibrationCurve.Length - 1; } + rtCalCurveIndex = index; + // gather nearby data points for (int r = index; r < rtCalibrationCurve.Length; r++) { @@ -741,6 +764,17 @@ internal RtInfo PredictRetentionTime( return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); } + // Wrapper for the other PredictRetentionTime + internal RtInfo PredictRetentionTime( + RetentionTimeCalibDataPoint[] rtCalibrationCurve, + ChromatographicPeak donorPeak, + SpectraFileInfo acceptorFile, + bool acceptorSampleIsFractionated, + bool donorSampleIsFractionated) + { + return PredictRetentionTime(rtCalibrationCurve, donorPeak, acceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, out int calCurveindex); + } + /// /// Constructs a MbrScorer object that is used to score all MBR peaks for a given acceptor file /// @@ -880,29 +914,45 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { ChromatographicPeak donorPeak = idDonorPeaks[i]; // TODO: Add a toggle that set rtRange to be maximum width - RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, + out int rtCalCurveIndex); if (rtInfo == null) continue; FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestAcceptor); // Draw a random donor that has an rt sufficiently far enough away - var randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; - int randomPeaksSampled = 0; - double minimumDifference = Math.Min(rtInfo.Width * 1.25, 1); - while(randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence - || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumDifference) // multiply for safety, in case the relative rt shifts after alignment + ChromatographicPeak randomDonor = null; + double minimumDifference = Math.Min(rtInfo.Width * 1.5, 1); + + // when choosing the "random" rt donor peak, we want to go closer to the center of the run + int iterator = rtCalCurveIndex < rtCalibrationCurve.Length / 2 ? 1 : -1; + for(int j = rtCalCurveIndex+iterator; j >=0 && j < rtCalibrationCurve.Length; j += iterator) { - randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; - if (randomPeaksSampled++ > (idDonorPeaks.Count - 1)) + RetentionTimeCalibDataPoint testPoint = rtCalibrationCurve[j]; + if (testPoint.DonorFilePeak != null && testPoint.DonorFilePeak.ApexRetentionTime > 0) { - randomDonor = null; - break; // Prevent infinite loops + double testRetentionTime = testPoint.DonorFilePeak.ApexRetentionTime; + if ( Math.Abs(testRetentionTime - donorPeak.ApexRetentionTime) > minimumDifference ) + { + randomDonor = testPoint.DonorFilePeak; + break; + } } } + //while(randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence + // || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumDifference) // multiply for safety, in case the relative rt shifts after alignment + //{ + // randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; + // if (randomPeaksSampled++ > (idDonorPeaks.Count - 1)) + // { + // randomDonor = null; + // break; // Prevent infinite loops + // } + //} if (randomDonor == null) continue; // Map the random rt onto the new file - RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor,idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (decoyRtInfo == null) continue; // Find a decoy peak using the randomly drawn retention time FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, randomRt:decoyRtInfo.PredictedRt); @@ -1044,7 +1094,7 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - double rtVariance = Math.Min(rtInfo.Width - RtWindowIncrease / 6, 0.05); // Minimum standard deviation of 3 seconds + double rtVariance = Math.Min((rtInfo.Width - RtWindowIncrease) / 6, 0.05); // Minimum standard deviation of 3 seconds Normal rtScoringDistribution = randomRt == null ? new Normal(rtInfo.PredictedRt, rtVariance) : new Normal((double)randomRt, rtVariance); bestAcceptor = null; @@ -1233,8 +1283,6 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) _results.Peaks[spectraFile] = errorCheckedPeaks; } - public int collisionCount = 0; - /// /// Takes in a list of imsPeaks and finds all the isotopic peaks in each scan. If the experimental isotopic distribution /// matches the theoretical distribution, an IsotopicEnvelope object is created from the summed intensities of each isotopic peak. diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index ed665b18a..c32a4526c 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.1.0 + 5.3.2.1 mzLib mzLib Stef S. From b6dbc8c99052f55629725ae1205a0405a9ff6f1c Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 6 Mar 2024 18:56:30 -0600 Subject: [PATCH 27/55] Optional PeptidesForMbr argument added, not covered in previous commit messages. Not actually this commit --- mzLib/mzLib.nuspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index c32a4526c..b4bbbddd5 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.2.1 + 5.3.2.3 mzLib mzLib Stef S. From b12c44cfe50809cbe25643bf86878c1544273e60 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 7 Mar 2024 00:48:09 -0600 Subject: [PATCH 28/55] 5.3.2.4 - Changed randomRT selection method --- mzLib/FlashLFQ/FlashLfqEngine.cs | 36 +++++++++++--------------------- mzLib/mzLib.nuspec | 2 +- 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index b99a2372b..82b63d2c0 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -921,34 +921,22 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestAcceptor); // Draw a random donor that has an rt sufficiently far enough away - ChromatographicPeak randomDonor = null; - double minimumDifference = Math.Min(rtInfo.Width * 1.5, 1); - - // when choosing the "random" rt donor peak, we want to go closer to the center of the run - int iterator = rtCalCurveIndex < rtCalibrationCurve.Length / 2 ? 1 : -1; - for(int j = rtCalCurveIndex+iterator; j >=0 && j < rtCalibrationCurve.Length; j += iterator) + ChromatographicPeak randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; + int randomPeaksSampled = 1; + double minimumDifference = Math.Min(rtInfo.Width * 1.25, 0.5); + + while (randomDonor == null + || randomDonor.Identifications.Any() //probably unneccesary check + || randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence + || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumDifference) // multiply for safety, in case the relative rt shifts after alignment { - RetentionTimeCalibDataPoint testPoint = rtCalibrationCurve[j]; - if (testPoint.DonorFilePeak != null && testPoint.DonorFilePeak.ApexRetentionTime > 0) + randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; + if (randomPeaksSampled++ > (rtCalibrationCurve.Length - 1)) { - double testRetentionTime = testPoint.DonorFilePeak.ApexRetentionTime; - if ( Math.Abs(testRetentionTime - donorPeak.ApexRetentionTime) > minimumDifference ) - { - randomDonor = testPoint.DonorFilePeak; - break; - } + randomDonor = null; + break; // Prevent infinite loops } } - //while(randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence - // || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumDifference) // multiply for safety, in case the relative rt shifts after alignment - //{ - // randomDonor = idDonorPeaks[randomGenerator.Next(idDonorPeaks.Count)]; - // if (randomPeaksSampled++ > (idDonorPeaks.Count - 1)) - // { - // randomDonor = null; - // break; // Prevent infinite loops - // } - //} if (randomDonor == null) continue; // Map the random rt onto the new file diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index b4bbbddd5..d3864119e 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.2.3 + 5.3.2.4 mzLib mzLib Stef S. From 6d96aa7c44c73052f8964908b3a06a244beeb8fa Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 7 Mar 2024 01:09:24 -0600 Subject: [PATCH 29/55] 5.3.2.5 - Minimum rtWindowWidth = 30 seconds --- mzLib/FlashLFQ/FlashLfqEngine.cs | 4 ++-- mzLib/FlashLFQ/RtInfo.cs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 82b63d2c0..0f49894b8 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -1056,8 +1056,8 @@ internal void FindAllAcceptorPeaks( Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; - double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - (rtInfo.Width / 2.0); - double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + (rtInfo.Width / 2.0); + double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - Math.Max((rtInfo.Width / 2.0), 0.25); + double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + Math.Max((rtInfo.Width / 2.0), 0.25); for (int j = 0; j < ms1ScanInfos.Length; j++) { diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs index 74c4e7b91..e6c4552ad 100644 --- a/mzLib/FlashLFQ/RtInfo.cs +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -12,8 +12,8 @@ public class RtInfo public double Width { get; } public double? RtSd { get; } public double? RtInterquartileRange { get; } - public double RtStartHypothesis => PredictedRt - (Width / 2.0); - public double RtEndHypothesis => PredictedRt + (Width / 2.0); + public double RtStartHypothesis => PredictedRt - Math.Max((Width / 2.0), 0.25); + public double RtEndHypothesis => PredictedRt + Math.Max((Width / 2.0), 0.25); public RtInfo(double predictedRt, double width, double? rtSd, double? rtInterquartileRange) { From 8bb7cbe9d53b3157c6400c3ad7e12facbca2cea3 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 7 Mar 2024 01:29:50 -0600 Subject: [PATCH 30/55] Actually updated nuget --- mzLib/mzLib.nuspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index d3864119e..aa836e246 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.2.4 + 5.3.2.5 mzLib mzLib Stef S. From 42e922c3cb061e55aba7fbadc31c4255a10e7c52 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 7 Mar 2024 01:39:55 -0600 Subject: [PATCH 31/55] 5.3.2.6 - Fixed decoy search. 2.4 and 2.5 are junk --- mzLib/FlashLFQ/FlashLfqEngine.cs | 1 - mzLib/mzLib.nuspec | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 0f49894b8..cba62b0a0 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -926,7 +926,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) double minimumDifference = Math.Min(rtInfo.Width * 1.25, 0.5); while (randomDonor == null - || randomDonor.Identifications.Any() //probably unneccesary check || randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumDifference) // multiply for safety, in case the relative rt shifts after alignment { diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index aa836e246..e0d7ed903 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.2.5 + 5.3.2.6 mzLib mzLib Stef S. From 12f3f34dac6dcea1e62353802c94de2091aa6f70 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 8 Mar 2024 12:01:00 -0600 Subject: [PATCH 32/55] Changed MBR RT prediction method --- mzLib/FlashLFQ/FlashLfqEngine.cs | 125 +++++++++++++++---------------- mzLib/FlashLFQ/MbrScorer.cs | 64 +++++++--------- 2 files changed, 88 insertions(+), 101 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index cba62b0a0..73d5d43a3 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -512,11 +512,12 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) /// Used by the match-between-runs algorithm to determine systematic retention time drifts between /// chromatographic runs. /// - private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor) + private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, out Normal rtDifferenceDistribution) { - var donorFileBestMsmsPeaks = new Dictionary(); - var acceptorFileBestMsmsPeaks = new Dictionary(); - var rtCalibrationCurve = new List(); + Dictionary donorFileBestMsmsPeaks = new(); + Dictionary acceptorFileBestMsmsPeaks = new(); + List rtCalibrationCurve = new(); + List anchorPeptideRtDiffs = new(); // get all peaks, not counting ambiguous peaks IEnumerable donorPeaks = _results.Peaks[donor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); @@ -562,9 +563,16 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec if (donorFileBestMsmsPeaks.TryGetValue(peak.Key, out ChromatographicPeak donorFilePeak)) { rtCalibrationCurve.Add(new RetentionTimeCalibDataPoint(donorFilePeak, acceptorFilePeak)); + if(donorFilePeak.ApexRetentionTime > 0 && acceptorFilePeak.ApexRetentionTime > 0) + { + anchorPeptideRtDiffs.Add(donorFilePeak.ApexRetentionTime - acceptorFilePeak.ApexRetentionTime); + } } } + // build rtDiff distribution + rtDifferenceDistribution = new Normal(mean: anchorPeptideRtDiffs.Median(), stddev: anchorPeptideRtDiffs.StandardDeviation()); + return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); } @@ -665,11 +673,11 @@ internal RtInfo PredictRetentionTime( ChromatographicPeak donorPeak, SpectraFileInfo acceptorFile, bool acceptorSampleIsFractionated, - bool donorSampleIsFractionated, - out int rtCalCurveIndex) + bool donorSampleIsFractionated, + MbrScorer scorer) { - rtCalCurveIndex = -1; var nearbyCalibrationPoints = new List(); + int numberOfAnchors = 4; // The number of anchor peptides to be used for local alignment. Must be an even number! // only compare +- 1 fraction if (acceptorSampleIsFractionated && donorSampleIsFractionated) @@ -696,83 +704,73 @@ internal RtInfo PredictRetentionTime( index = rtCalibrationCurve.Length - 1; } - rtCalCurveIndex = index; - + int numberOfForwardAnchors = 0; // gather nearby data points for (int r = index; r < rtCalibrationCurve.Length; r++) { double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - - if (Math.Abs(rtDiff) < 0.5) + if (rtCalibrationCurve[r].AcceptorFilePeak != null + && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { + if(Math.Abs(rtDiff) < 0.5) // If the rtDiff is too large, it's no longer local alignment + { + break; + } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); - } - else - { - break; + numberOfForwardAnchors++; + if(numberOfForwardAnchors >= numberOfAnchors / 2) // We only want a handful of anchor points + { + break; + } } } + int numberOfBackwardsAnchors = 0; for (int r = index - 1; r >= 0; r--) { double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - - if (Math.Abs(rtDiff) < 0.5) + if (rtCalibrationCurve[r].AcceptorFilePeak != null + && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { + if (Math.Abs(rtDiff) < 0.5) // If the rtDiff is too large, it's no longer local alignment + { + break; + } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); - } - else - { - break; + numberOfBackwardsAnchors++; + if (numberOfBackwardsAnchors >= numberOfAnchors / 2) // We only want a handful of anchor points + { + break; + } } } + double medianRtDiff; if (!nearbyCalibrationPoints.Any()) { - return null; + medianRtDiff = scorer.GetMedianRtDiff(donorPeak.SpectraFileInfo); + } + else + { + // calculate difference between acceptor and donor RTs for these RT region + List rtDiffs = nearbyCalibrationPoints + .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) + .ToList(); + medianRtDiff = rtDiffs.Median(); } - - // calculate difference between acceptor and donor RTs for these RT region - List rtDiffs = nearbyCalibrationPoints - .Select(p => p.AcceptorFilePeak.Apex.IndexedPeak.RetentionTime - p.DonorFilePeak.Apex.IndexedPeak.RetentionTime) - .ToList(); // figure out the range of RT differences between the files that are "reasonable", centered around the median difference - double median = rtDiffs.Median(); + double rtRange = scorer.GetRTWindowWidth(donorPeak.SpectraFileInfo); // default range (if only 1 datapoint, or SD is 0, range is very high, etc) - double rtRange = MbrRtWindow; + //double rtRange = MbrRtWindow; double? rtStdDev = null; double? rtInterquartileRange = null; - if (nearbyCalibrationPoints.Count < 6 && nearbyCalibrationPoints.Count > 1 && rtDiffs.StandardDeviation() > 0) - { - rtStdDev = rtDiffs.StandardDeviation(); - rtRange = (double)rtStdDev * 6.0; // Multiplication inherited from legacy code, unsure of reason for 6 - } - else if (nearbyCalibrationPoints.Count >= 6 && rtDiffs.InterquartileRange() > 0) - { - rtInterquartileRange = rtDiffs.InterquartileRange(); - rtRange = (double)rtInterquartileRange * 4.5; // Multiplication inherited from legacy code, unsure of reason for 4.5 - } - //TODO: Expand range and see what happens rtRange = Math.Min(rtRange+RtWindowIncrease, MbrRtWindow+RtWindowIncrease); - //rtRange = Math.Min(rtRange, MbrRtWindow); - - return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); - } - - // Wrapper for the other PredictRetentionTime - internal RtInfo PredictRetentionTime( - RetentionTimeCalibDataPoint[] rtCalibrationCurve, - ChromatographicPeak donorPeak, - SpectraFileInfo acceptorFile, - bool acceptorSampleIsFractionated, - bool donorSampleIsFractionated) - { - return PredictRetentionTime(rtCalibrationCurve, donorPeak, acceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, out int calCurveindex); + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); } /// @@ -902,7 +900,8 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile); + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile, out Normal rtDifferenceDistribution); + scorer.AddRtDiffDistribution(donorFilePeakListKvp.Key, rtDifferenceDistribution); // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), @@ -914,8 +913,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { ChromatographicPeak donorPeak = idDonorPeaks[i]; // TODO: Add a toggle that set rtRange to be maximum width - RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, - out int rtCalCurveIndex); + RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); if (rtInfo == null) continue; FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestAcceptor); @@ -939,7 +937,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (randomDonor == null) continue; // Map the random rt onto the new file - RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); if (decoyRtInfo == null) continue; // Find a decoy peak using the randomly drawn retention time FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, randomRt:decoyRtInfo.PredictedRt); @@ -1081,8 +1079,6 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - double rtVariance = Math.Min((rtInfo.Width - RtWindowIncrease) / 6, 0.05); // Minimum standard deviation of 3 seconds - Normal rtScoringDistribution = randomRt == null ? new Normal(rtInfo.PredictedRt, rtVariance) : new Normal((double)randomRt, rtVariance); bestAcceptor = null; foreach (int z in chargesToMatch) @@ -1105,7 +1101,7 @@ internal void FindAllAcceptorPeaks( while (chargeEnvelopes.Any()) { ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, - fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes, randomRt: randomRt != null); + fileSpecificTol, rtInfo, z, chargeEnvelopes, randomRt: randomRt != null); if (acceptorPeak == null) continue; if (bestAcceptor == null || bestAcceptor.MbrScore < acceptorPeak.MbrScore) @@ -1157,8 +1153,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( MbrScorer scorer, ChromatographicPeak donorPeak, Tolerance mbrTol, - RtInfo rtInfo, - Normal rtScoringDistribution, + RtInfo rtInfo, int z, List chargeEnvelopes, bool randomRt = false) @@ -1188,9 +1183,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, - rtScoringDistribution, - donorPeak); + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak); return acceptorPeak; } diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index de4ed4be1..73d6699a2 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -13,12 +13,13 @@ namespace FlashLFQ /// internal class MbrScorer { - // Intensity and ppm distribution are specific to each acceptor file + // Intensity and ppm distributions are specific to each acceptor file private readonly Normal _logIntensityDistribution; private readonly Normal _ppmDistribution; private readonly Normal _scanCountDistribution; - // The logFcDistributions are unique to each donor file - acceptor file pair + // The logFcDistributions and rtDifference distributions are unique to each donor file - acceptor file pair private Dictionary _logFcDistributionDictionary; + private Dictionary _rtDifferenceDistributionDictionary; // Donor file rt - Acceptor File rt internal Dictionary ApexToAcceptorFilePeakDict { get; } internal List UnambiguousMsMsAcceptorPeaks { get; } @@ -41,23 +42,46 @@ internal MbrScorer( _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; _logFcDistributionDictionary = new(); + _rtDifferenceDistributionDictionary = new(); // This is kludgey, because scan counts are discrete List scanList = acceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); // build a normal distribution for the scan list of the acceptor peaks _scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36); } + internal void AddRtDiffDistribution(SpectraFileInfo donorFile, Normal rtDiffDistribution) + { + _rtDifferenceDistributionDictionary.Add(donorFile, rtDiffDistribution); + } + + /// + /// Get the RT window width for a given donor file, + /// where RT window width is equal to 6*stdDev of the rtDiffs for all anchor peptides + /// + /// The width of the retention time window in minutes + internal double GetRTWindowWidth(SpectraFileInfo donorFile) + { + // 99.7% of all peaks are expected to fall within six standard deviations + return _rtDifferenceDistributionDictionary[donorFile].StdDev * 6; + } + + internal double GetMedianRtDiff(SpectraFileInfo donorFile) + { + return _rtDifferenceDistributionDictionary[donorFile].Median; + } + /// /// Scores a MBR peak based on it's retention time, ppm error, and intensity /// /// An MBR Score ranging between 0 and 100. Higher scores are better. - internal double ScoreMbr(ChromatographicPeak acceptorPeak, Normal rtDistribution, ChromatographicPeak? donorPeak = null) + internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) { acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); - acceptorPeak.RtScore = CalculateScore(rtDistribution, acceptorPeak.ApexRetentionTime); + acceptorPeak.RtScore = CalculateScore( + _rtDifferenceDistributionDictionary[donorPeak.SpectraFileInfo], + donorPeak.ApexRetentionTime - acceptorPeak.ApexRetentionTime); acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); - //acceptorPeak.ScanCountScore = (double)acceptorPeak.ScanCount / _scanCountDistribution.Median; double donorIdPEP = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First().PosteriorErrorProbability; @@ -71,9 +95,6 @@ internal double CalculateScore(Normal distribution, double value) double absoluteDiffFromMean = Math.Abs(distribution.Mean - value); // Returns a value between (0, 1] where 1 means the value was equal to the distribution mean return 2 * distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); - - // old method - //return DensityScoreConversion(distribution.Density(value)); } internal double CalculateIntensityScore(double acceptorIntensity, ChromatographicPeak donorPeak) @@ -102,21 +123,6 @@ internal double CalculateIntensityScore(double acceptorIntensity, Chromatographi } - internal double CalculatePpmScore(double ppmError) - { - return DensityScoreConversion(_ppmDistribution.Density(ppmError)); - } - - internal double CalculateRtScore(double retentionTime, Normal rtDistribution) - { - return DensityScoreConversion(rtDistribution.Density(retentionTime)); - } - - internal double CalculateScanCountScore(int scanCount) - { - return (double)scanCount / (double)MaxNumberOfScansObserved; - } - /// /// Find the difference in peptide intensities between donor and acceptor files /// this intensity score creates a conservative bias in MBR @@ -170,18 +176,6 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP _logFcDistributionDictionary.Add(idDonorPeaks.First().SpectraFileInfo, foldChangeDistribution); } } - - /// - /// Takes in the density of a normal distribution at a given point, and transforms it - /// by taking the log of the density plus the square root of the squared density plus one - /// This transformation was implemented in the original code, and we're unsure of the rationale - /// - /// A Normal distribution - /// The transformed score - private double DensityScoreConversion(double density) - { - return Math.Log(density + Math.Sqrt(Math.Pow(density, 2) + 1)); - } } } From 175e0a7813cdf631bb5adf16422f1c6f90a01771 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 8 Mar 2024 12:14:38 -0600 Subject: [PATCH 33/55] Fixed bug in rtPrediction --- mzLib/FlashLFQ/FlashLfqEngine.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 73d5d43a3..90ff9e197 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -706,13 +706,13 @@ internal RtInfo PredictRetentionTime( int numberOfForwardAnchors = 0; // gather nearby data points - for (int r = index; r < rtCalibrationCurve.Length; r++) + for (int r = index+1; r < rtCalibrationCurve.Length; r++) { double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; if (rtCalibrationCurve[r].AcceptorFilePeak != null && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { - if(Math.Abs(rtDiff) < 0.5) // If the rtDiff is too large, it's no longer local alignment + if(Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment { break; } @@ -732,7 +732,7 @@ internal RtInfo PredictRetentionTime( if (rtCalibrationCurve[r].AcceptorFilePeak != null && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { - if (Math.Abs(rtDiff) < 0.5) // If the rtDiff is too large, it's no longer local alignment + if (Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment { break; } @@ -1053,8 +1053,8 @@ internal void FindAllAcceptorPeaks( Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; - double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - Math.Max((rtInfo.Width / 2.0), 0.25); - double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + Math.Max((rtInfo.Width / 2.0), 0.25); + double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - (rtInfo.Width / 2.0); + double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + (rtInfo.Width / 2.0); for (int j = 0; j < ms1ScanInfos.Length; j++) { From bcf8d7cdf8ff2b4b5d486fcc398514a09c1619f2 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 8 Mar 2024 18:01:41 -0600 Subject: [PATCH 34/55] Changed decoy search method and fixed issues with decoy rt scoring --- mzLib/FlashLFQ/FlashLfqEngine.cs | 45 ++++++++++++++++++-------------- mzLib/FlashLFQ/MbrScorer.cs | 18 ++++++++----- mzLib/mzLib.nuspec | 2 +- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 90ff9e197..8b6f25a0d 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -100,7 +100,7 @@ public FlashLfqEngine( // MBR settings bool matchBetweenRuns = false, - double matchBetweenRunsPpmTolerance = 10.0, + double matchBetweenRunsPpmTolerance = 5.0, double maxMbrWindow = 2.5, bool requireMsmsIdInCondition = false, @@ -512,13 +512,14 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) /// Used by the match-between-runs algorithm to determine systematic retention time drifts between /// chromatographic runs. /// - private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, out Normal rtDifferenceDistribution) - { + private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, MbrScorer scorer) + { Dictionary donorFileBestMsmsPeaks = new(); Dictionary acceptorFileBestMsmsPeaks = new(); List rtCalibrationCurve = new(); List anchorPeptideRtDiffs = new(); + // get all peaks, not counting ambiguous peaks IEnumerable donorPeaks = _results.Peaks[donor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); IEnumerable acceptorPeaks = _results.Peaks[acceptor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); @@ -570,8 +571,10 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec } } + // build rtDiff distribution - rtDifferenceDistribution = new Normal(mean: anchorPeptideRtDiffs.Median(), stddev: anchorPeptideRtDiffs.StandardDeviation()); + var rtDifferenceDistribution = new Normal(mean: anchorPeptideRtDiffs.Median(), stddev: anchorPeptideRtDiffs.StandardDeviation()); + scorer.AddRtDiffDistribution(donor, rtDifferenceDistribution); return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); } @@ -746,9 +749,12 @@ internal RtInfo PredictRetentionTime( } double medianRtDiff; + double rtRange; if (!nearbyCalibrationPoints.Any()) { + // Default rt adjustments medianRtDiff = scorer.GetMedianRtDiff(donorPeak.SpectraFileInfo); + rtRange = scorer.GetRTWindowWidth(donorPeak.SpectraFileInfo); } else { @@ -756,14 +762,11 @@ internal RtInfo PredictRetentionTime( List rtDiffs = nearbyCalibrationPoints .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) .ToList(); + medianRtDiff = rtDiffs.Median(); + rtRange = rtDiffs.InterquartileRange() * 3; // This is roughly equivalent to 2 standard deviations } - - // figure out the range of RT differences between the files that are "reasonable", centered around the median difference - double rtRange = scorer.GetRTWindowWidth(donorPeak.SpectraFileInfo); - - // default range (if only 1 datapoint, or SD is 0, range is very high, etc) - //double rtRange = MbrRtWindow; + double? rtStdDev = null; double? rtInterquartileRange = null; @@ -900,8 +903,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile, out Normal rtDifferenceDistribution); - scorer.AddRtDiffDistribution(donorFilePeakListKvp.Key, rtDifferenceDistribution); + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile, scorer); // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), @@ -921,13 +923,18 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // Draw a random donor that has an rt sufficiently far enough away ChromatographicPeak randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; int randomPeaksSampled = 1; - double minimumDifference = Math.Min(rtInfo.Width * 1.25, 0.5); + // multiply for safety, in case the relative rt shifts after alignment + double minimumRtDifference = Math.Min(rtInfo.Width * 1.5, 0.5); + double massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); - while (randomDonor == null + while (randomDonor == null + || massDiff < 0.1 + || massDiff > 50 // Need the random one to be relatively close in mass (but not too close!) || randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence - || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumDifference) // multiply for safety, in case the relative rt shifts after alignment + || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumRtDifference) { randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; + massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); if (randomPeaksSampled++ > (rtCalibrationCurve.Length - 1)) { randomDonor = null; @@ -1101,7 +1108,7 @@ internal void FindAllAcceptorPeaks( while (chargeEnvelopes.Any()) { ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, - fileSpecificTol, rtInfo, z, chargeEnvelopes, randomRt: randomRt != null); + fileSpecificTol, rtInfo, z, chargeEnvelopes, randomRt); if (acceptorPeak == null) continue; if (bestAcceptor == null || bestAcceptor.MbrScore < acceptorPeak.MbrScore) @@ -1156,10 +1163,10 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( RtInfo rtInfo, int z, List chargeEnvelopes, - bool randomRt = false) + double? randomRt = null) { var donorId = donorPeak.Identifications.OrderBy(p => p.QValue).First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt != null); // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -1183,7 +1190,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak); + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, randomRt ?? rtInfo.PredictedRt); return acceptorPeak; } diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 73d6699a2..47072ea67 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -26,6 +26,8 @@ internal class MbrScorer internal double MaxNumberOfScansObserved { get; } + internal readonly Normal _rtErrorDistribution; + /// /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution /// unique to each donor file - acceptor file pair. These are used to score MBR matches @@ -41,8 +43,10 @@ internal MbrScorer( MaxNumberOfScansObserved = acceptorPeaks.Max(peak => peak.ScanCount); _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; + _rtErrorDistribution = new Normal(mean: 0, stddev: 0.1); // in minutes _logFcDistributionDictionary = new(); _rtDifferenceDistributionDictionary = new(); + // This is kludgey, because scan counts are discrete List scanList = acceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); // build a normal distribution for the scan list of the acceptor peaks @@ -56,13 +60,13 @@ internal void AddRtDiffDistribution(SpectraFileInfo donorFile, Normal rtDiffDist /// /// Get the RT window width for a given donor file, - /// where RT window width is equal to 6*stdDev of the rtDiffs for all anchor peptides + /// where RT window width is equal to 4*stdDev of the rtDiffs for all anchor peptides /// /// The width of the retention time window in minutes internal double GetRTWindowWidth(SpectraFileInfo donorFile) { - // 99.7% of all peaks are expected to fall within six standard deviations - return _rtDifferenceDistributionDictionary[donorFile].StdDev * 6; + // 95% of all peaks are expected to fall within six standard deviations + return _rtDifferenceDistributionDictionary[donorFile].StdDev * 4; } internal double GetMedianRtDiff(SpectraFileInfo donorFile) @@ -70,16 +74,16 @@ internal double GetMedianRtDiff(SpectraFileInfo donorFile) return _rtDifferenceDistributionDictionary[donorFile].Median; } + + /// /// Scores a MBR peak based on it's retention time, ppm error, and intensity /// /// An MBR Score ranging between 0 and 100. Higher scores are better. - internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) + internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak, double predictedRt) { acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); - acceptorPeak.RtScore = CalculateScore( - _rtDifferenceDistributionDictionary[donorPeak.SpectraFileInfo], - donorPeak.ApexRetentionTime - acceptorPeak.ApexRetentionTime); + acceptorPeak.RtScore = CalculateScore(_rtErrorDistribution, acceptorPeak.ApexRetentionTime - predictedRt); acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index e0d7ed903..d19981eb2 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.2.6 + 5.3.2.12 mzLib mzLib Stef S. From 768b9218539a769fa9f504ddb2ddaa862a445db6 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 8 Mar 2024 23:30:16 -0600 Subject: [PATCH 35/55] small changes to double check procedure --- mzLib/FlashLFQ/FlashLfqEngine.cs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 97e64cfc6..ba5e55e18 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -806,6 +806,14 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); + // Find the 1000 highest scoring psms in the acceptor file, + HashSet bestAcceptorSequences = acceptorFileIdentifiedPeaks + .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) + .OrderByDescending(peak => peak.Identifications.First().PsmScore) + .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence)) + .Take(1000) + .ToHashSet(); + MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); if (scorer == null) return; @@ -875,7 +883,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (range, loopState) => { - for (int i = range.Item1; i < range.Item2; i++) { ChromatographicPeak donorPeak = idDonorPeaks[i]; @@ -913,13 +920,19 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } }); + HashSet acceptorFileTopMsmsSeqs = rtCalibrationCurve.Where(point => point.DonorFilePeak != null && point.AcceptorFilePeak != null) + .OrderByDescending(p => p.AcceptorFilePeak.Identifications.First().PsmScore) + .Select(p => p.AcceptorFilePeak.Identifications.First().ModifiedSequence) + .Take(1000) + .ToHashSet(); + // List of donor peaks where the peptide WAS identified in the acceptor file and the best donor is a different file. // This will be used for checking the error rate // For each sequence, we only select one peak corresponding to the PSM with the lowest q-value List donorPeaksWithMsms = donorFilePeakListKvp.Value - .Where(p => acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) + .Where(p => acceptorFileTopMsmsSeqs.Contains(p.Identifications.First().ModifiedSequence) && p.SpectraFileInfo != idAcceptorFile - && !p.Identifications.First().IsDecoy ) + && !p.Identifications.First().IsDecoy) .ToList(); // Loop through every MSMS id in the donor file @@ -936,6 +949,8 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (rtInfo == null) continue; FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, doubleCheckPeaks, out var bestAcceptor); + + // Then look for a peak decoy if(bestAcceptor == null) { doubleCheckPeaks.TryAdd( From 9d13eb2e58387346cf46b8b9402a8f82a363458b Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 8 Mar 2024 23:49:05 -0600 Subject: [PATCH 36/55] Finished Merging in MbrFdr - mzLib 5341 --- mzLib/FlashLFQ/FlashLFQResults.cs | 20 ++++- mzLib/FlashLFQ/FlashLfqEngine.cs | 141 ++++++++++++++++++++++++++---- 2 files changed, 145 insertions(+), 16 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index c6218bc81..4e8a11bfb 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -14,6 +14,7 @@ public class FlashLfqResults public readonly Dictionary PeptideModifiedSequences; public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; + public readonly Dictionary> DoubleCheckPeaks; public IEnumerable DecoyPeaks { get; set; } public FlashLfqResults(List spectraFiles, List identifications) @@ -22,10 +23,12 @@ public FlashLfqResults(List spectraFiles, List PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); + DoubleCheckPeaks = new Dictionary>(); foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); + DoubleCheckPeaks.Add(file, new List()); } foreach (Identification id in identifications) @@ -563,7 +566,22 @@ public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, st } } - if(decoyPath != null & DecoyPeaks.IsNotNullOrEmpty()) + string[] pathSplit = peaksOutputPath.Split(Path.DirectorySeparatorChar); + pathSplit[^1] = "DoubleCheckedPeaks.tsv"; + + using (var output = new StreamWriter(String.Join(Path.DirectorySeparatorChar, pathSplit))) + { + output.WriteLine(ChromatographicPeak.TabSeparatedHeader); + + foreach (var peak in DoubleCheckPeaks.SelectMany(p => p.Value) + .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) + .ThenByDescending(p => p.Collision)) + { + output.WriteLine(peak.ToString()); + } + } + + if (decoyPath != null & DecoyPeaks.IsNotNullOrEmpty()) { using (StreamWriter output = new StreamWriter(decoyPath)) { diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 810c7d852..9c4359934 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -826,7 +826,6 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie /// private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { - bool acceptorSampleIsFractionated = _results.SpectraFiles .Where(p => p.Condition == idAcceptorFile.Condition && p.BiologicalReplicate == idAcceptorFile.BiologicalReplicate) .Select(p => p.Fraction) @@ -841,14 +840,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); - // Find the 1000 highest scoring psms in the acceptor file, - HashSet bestAcceptorSequences = acceptorFileIdentifiedPeaks - .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) - .OrderByDescending(peak => peak.Identifications.First().PsmScore) - .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence)) - .Take(1000) - .ToHashSet(); - MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); if (scorer == null) return; @@ -987,13 +978,43 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { ChromatographicPeak donorPeak = donorPeaksWithMsms[i]; // TODO: Add a toggle that set rtRange to be maximum width - RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); if (rtInfo == null) continue; FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, doubleCheckPeaks, out var bestAcceptor); - + // Then look for a peak decoy - if(bestAcceptor == null) + // Draw a random donor that has an rt sufficiently far enough away + ChromatographicPeak randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; + int randomPeaksSampled = 1; + // multiply for safety, in case the relative rt shifts after alignment + double minimumRtDifference = Math.Min(rtInfo.Width * 1.5, 0.5); + double massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); + + while (randomDonor == null + || massDiff < 0.1 + || massDiff > 50 // Need the random one to be relatively close in mass (but not too close!) + || randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence + || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumRtDifference) + { + randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; + massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); + if (randomPeaksSampled++ > (rtCalibrationCurve.Length - 1)) + { + randomDonor = null; + break; // Prevent infinite loops + } + } + if (randomDonor == null) continue; + + // Map the random rt onto the new file + RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); + if (decoyRtInfo == null) continue; + // Find a decoy peak using the randomly drawn retention time + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, randomRt: decoyRtInfo.PredictedRt); + + // store occasions where no peak was found. This allows us to calculate sensitivity + if (bestAcceptor == null && bestDecoy == null) { doubleCheckPeaks.TryAdd( key: donorPeak.Identifications.First().ModifiedSequence, @@ -1067,20 +1088,110 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); - //List peaksToRemoveFromHypotheses = new List(); foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) { if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) { best.MergeFeatureWith(peak, Integrate); - - //peaksToRemoveFromHypotheses.Add(peak); } } } _results.Peaks[idAcceptorFile].Add(best); } + // repeat basically the same procedure for the double-check peaks + foreach (var seqDictionaryKvp in doubleCheckPeaks.Where(kvp => kvp.Value != null)) + { + // Each isotopic envelope is linked to a list of ChromatographicPeaks + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. + foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) + { + List bestPeaks = new(); + foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + { + bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); + } + envelopePeakListKvp.Value.Clear(); + envelopePeakListKvp.Value.AddRange(bestPeaks); + } + } + + // take the best result (highest scoring) for each peptide after we've matched from all the donor files + foreach (var mbrIdentifiedPeptide in doubleCheckPeaks.Where(p => acceptorFileIdentifiedSequences.Contains(p.Key))) + { + string peptideModifiedSequence = mbrIdentifiedPeptide.Key; + List msmsPeaks = acceptorFileIdentifiedPeaks + .Where(peak => peak.Apex != null && peak.Identifications.First().ModifiedSequence.Equals(peptideModifiedSequence)).ToList(); + if (!msmsPeaks.Any()) continue; //This shouldn't happen + if (mbrIdentifiedPeptide.Value == null) + { + ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); + nullPeak.Collision = "0"; // Zero represent peak not found + _results.DoubleCheckPeaks[idAcceptorFile].Add(nullPeak); + continue; + } + + List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); + ChromatographicPeak best = peakHypotheses.FirstOrDefault(); + peakHypotheses.Remove(best); + + if (peakHypotheses.Count > 0) + { + double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); + double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); + + List peaksToRemoveFromHypotheses = new List(); + foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) + { + if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) + { + best.MergeFeatureWith(peak, Integrate); + + peaksToRemoveFromHypotheses.Add(peak); + } + } + } + + if (best == null || best.Apex == null) + { + ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); + nullPeak.Collision = "0"; // Zero represent peak not found + _results.DoubleCheckPeaks[idAcceptorFile].Add(nullPeak); + continue; + } + + if (msmsPeaks.Any(peak => peak.Apex.Equals(best.Apex))) + { + best.Collision = "1"; // One is best possible + } + else + { + var test = msmsPeaks.Where(peak => Math.Abs(peak.Apex.IndexedPeak.RetentionTime - best.Apex.IndexedPeak.RetentionTime) < 0.0001); + if (test.IsNotNullOrEmpty()) + { + best.Collision = "2"; // Assumed same time, different charge state + } + else + { + test = msmsPeaks.Where(peak => + { + var rts = peak.IsotopicEnvelopes.Select(e => e.IndexedPeak.RetentionTime); + return best.ApexRetentionTime >= rts.Minimum() && best.ApexRetentionTime <= rts.Maximum(); + }); + if (test.IsNotNullOrEmpty()) + { + best.Collision = "3"; // Overlap peak + } + else + { + best.Collision = "-1"; + } + } + } + _results.DoubleCheckPeaks[idAcceptorFile].Add(best); + } + + RunErrorChecking(idAcceptorFile); } From b147e3b184b055a5b7f03c8325242b985bbe05f1 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 8 Mar 2024 23:57:36 -0600 Subject: [PATCH 37/55] minor --- mzLib/FlashLFQ/FlashLfqEngine.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 9c4359934..d5f0731d0 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -956,7 +956,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) HashSet acceptorFileTopMsmsSeqs = rtCalibrationCurve.Where(point => point.DonorFilePeak != null && point.AcceptorFilePeak != null) .OrderByDescending(p => p.AcceptorFilePeak.Identifications.First().PsmScore) .Select(p => p.AcceptorFilePeak.Identifications.First().ModifiedSequence) - .Take(1000) + .Take(1500) .ToHashSet(); // List of donor peaks where the peptide WAS identified in the acceptor file and the best donor is a different file. From 01482fc7a43138cc3a680494a72774dbc51faeb3 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 9 Mar 2024 00:12:51 -0600 Subject: [PATCH 38/55] Actually changed nuspec --- mzLib/mzLib.nuspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index d19981eb2..3a8bd4f21 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.2.12 + 5.3.4.1 mzLib mzLib Stef S. From 5ee1d21884eb6ccd37f95025bc1b053246b61d75 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 9 Mar 2024 00:54:59 -0600 Subject: [PATCH 39/55] Increased Rt Range --- mzLib/FlashLFQ/FlashLfqEngine.cs | 2 +- mzLib/mzLib.nuspec | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index d5f0731d0..6c1fbea01 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -764,7 +764,7 @@ internal RtInfo PredictRetentionTime( .ToList(); medianRtDiff = rtDiffs.Median(); - rtRange = rtDiffs.InterquartileRange() * 3; // This is roughly equivalent to 2 standard deviations + rtRange = rtDiffs.InterquartileRange() * 6; // This is roughly equivalent to 2 standard deviations } double? rtStdDev = null; diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 3a8bd4f21..1319cc331 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.4.1 + 5.3.4.2 mzLib mzLib Stef S. From 7e326259eb9a5a666243ffcfead44cb9c9697389 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 9 Mar 2024 01:11:01 -0600 Subject: [PATCH 40/55] Increased doublecheck count to 2500. mzLib 5343 --- mzLib/FlashLFQ/FlashLfqEngine.cs | 3 ++- mzLib/mzLib.nuspec | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 6c1fbea01..1952230c7 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -956,7 +956,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) HashSet acceptorFileTopMsmsSeqs = rtCalibrationCurve.Where(point => point.DonorFilePeak != null && point.AcceptorFilePeak != null) .OrderByDescending(p => p.AcceptorFilePeak.Identifications.First().PsmScore) .Select(p => p.AcceptorFilePeak.Identifications.First().ModifiedSequence) - .Take(1500) + .Take(10000) .ToHashSet(); // List of donor peaks where the peptide WAS identified in the acceptor file and the best donor is a different file. @@ -966,6 +966,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) .Where(p => acceptorFileTopMsmsSeqs.Contains(p.Identifications.First().ModifiedSequence) && p.SpectraFileInfo != idAcceptorFile && !p.Identifications.First().IsDecoy) + .Take(2500) .ToList(); // Loop through every MSMS id in the donor file diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 1319cc331..b30fa090a 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.4.2 + 5.3.4.3 mzLib mzLib Stef S. From 3ceb44776ea3c2f349e94d9696667556e98c8439 Mon Sep 17 00:00:00 2001 From: Alex Date: Sat, 9 Mar 2024 11:01:51 -0600 Subject: [PATCH 41/55] reduced window slightly. mzLib 5344 --- mzLib/FlashLFQ/FlashLfqEngine.cs | 23 +++++++++++------------ mzLib/mzLib.nuspec | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 1952230c7..f8641714d 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -753,19 +753,19 @@ internal RtInfo PredictRetentionTime( if (!nearbyCalibrationPoints.Any()) { // Default rt adjustments - medianRtDiff = scorer.GetMedianRtDiff(donorPeak.SpectraFileInfo); - rtRange = scorer.GetRTWindowWidth(donorPeak.SpectraFileInfo); + return null; + //medianRtDiff = scorer.GetMedianRtDiff(donorPeak.SpectraFileInfo); + //rtRange = scorer.GetRTWindowWidth(donorPeak.SpectraFileInfo); } - else - { - // calculate difference between acceptor and donor RTs for these RT region - List rtDiffs = nearbyCalibrationPoints - .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) - .ToList(); + + // calculate difference between acceptor and donor RTs for these RT region + List rtDiffs = nearbyCalibrationPoints + .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) + .ToList(); - medianRtDiff = rtDiffs.Median(); - rtRange = rtDiffs.InterquartileRange() * 6; // This is roughly equivalent to 2 standard deviations - } + medianRtDiff = rtDiffs.Median(); + rtRange = rtDiffs.InterquartileRange() * 4.5; // This is roughly equivalent to 2 standard deviations + double? rtStdDev = null; double? rtInterquartileRange = null; @@ -954,7 +954,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) }); HashSet acceptorFileTopMsmsSeqs = rtCalibrationCurve.Where(point => point.DonorFilePeak != null && point.AcceptorFilePeak != null) - .OrderByDescending(p => p.AcceptorFilePeak.Identifications.First().PsmScore) .Select(p => p.AcceptorFilePeak.Identifications.First().ModifiedSequence) .Take(10000) .ToHashSet(); diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index b30fa090a..367859f81 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -1,7 +1,7 @@ - 5.3.4.3 + 5.3.4.4 mzLib mzLib Stef S. From 1d5422f6b579f940a5ba377e62543088f3a820a8 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 18 Apr 2024 11:31:30 -0500 Subject: [PATCH 42/55] amended rt scoring distribution in mbr scorer --- mzLib/FlashLFQ/FlashLfqEngine.cs | 9 +++--- mzLib/FlashLFQ/MbrScorer.cs | 55 +++++++++++++++++++++++--------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index f8641714d..45111f342 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -517,7 +517,7 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec Dictionary donorFileBestMsmsPeaks = new(); Dictionary acceptorFileBestMsmsPeaks = new(); List rtCalibrationCurve = new(); - List anchorPeptideRtDiffs = new(); + List anchorPeptideRtDiffs = new(); // anchor peptides are peptides that were MS2 detected in both the donor and acceptor runs // get all peaks, not counting ambiguous peaks @@ -571,10 +571,9 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec } } - // build rtDiff distribution - var rtDifferenceDistribution = new Normal(mean: anchorPeptideRtDiffs.Median(), stddev: anchorPeptideRtDiffs.StandardDeviation()); - scorer.AddRtDiffDistribution(donor, rtDifferenceDistribution); + //var rtDifferenceDistribution = new Normal(mean: anchorPeptideRtDiffs.Median(), stddev: anchorPeptideRtDiffs.StandardDeviation()); + scorer.AddRtPredErrorDistribution(donor, anchorPeptideRtDiffs); return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); } @@ -968,6 +967,8 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) .Take(2500) .ToList(); + if (donorPeaksWithMsms.Count < 1) continue; + // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, donorPeaksWithMsms.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 47072ea67..261023c0a 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; using System.Data; +using System.Data.Entity.ModelConfiguration.Conventions; using System.Linq; namespace FlashLFQ @@ -19,15 +20,12 @@ internal class MbrScorer private readonly Normal _scanCountDistribution; // The logFcDistributions and rtDifference distributions are unique to each donor file - acceptor file pair private Dictionary _logFcDistributionDictionary; - private Dictionary _rtDifferenceDistributionDictionary; // Donor file rt - Acceptor File rt + private Dictionary _rtPredictionErrorDistributionDictionary; internal Dictionary ApexToAcceptorFilePeakDict { get; } internal List UnambiguousMsMsAcceptorPeaks { get; } internal double MaxNumberOfScansObserved { get; } - - internal readonly Normal _rtErrorDistribution; - /// /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution /// unique to each donor file - acceptor file pair. These are used to score MBR matches @@ -43,9 +41,8 @@ internal MbrScorer( MaxNumberOfScansObserved = acceptorPeaks.Max(peak => peak.ScanCount); _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; - _rtErrorDistribution = new Normal(mean: 0, stddev: 0.1); // in minutes _logFcDistributionDictionary = new(); - _rtDifferenceDistributionDictionary = new(); + _rtPredictionErrorDistributionDictionary = new(); // This is kludgey, because scan counts are discrete List scanList = acceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); @@ -53,9 +50,40 @@ internal MbrScorer( _scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36); } - internal void AddRtDiffDistribution(SpectraFileInfo donorFile, Normal rtDiffDistribution) + /// + /// Takes in a list of retention time differences for anchor peptides (donor RT - acceptor RT) and uses + /// this list to calculate the distribution of prediction errors of the local RT alignment strategy employed by + /// match-between-runs for the specified donor file + /// + /// List of retention time differences (doubles) calculated as donor file RT - acceptor file RT + internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs) { - _rtDifferenceDistributionDictionary.Add(donorFile, rtDiffDistribution); + // in MBR, we use anchor peptides on either side of the donor to predict the retention time + // here, we're going to repeat the same process, using neighboring anchor peptides to predicte the Rt shift for each + // individual anchor peptide + // then, we'll check how close our predicted rt shift was to the observed rt shift + // and build a distribution based on the predicted v actual rt diffs + + int numAnchorPepsPerSide = 2; // hardCoded for now, number of anchor peptides on each side of the "donor" to be considered + double cumSumRtDiffs; + List rtPredictionErrors = new(); + + for (int i = numAnchorPepsPerSide; i < anchorPeptideRtDiffs.Count - (numAnchorPepsPerSide); i++) + { + cumSumRtDiffs = 0; + for(int j = 1; j <= numAnchorPepsPerSide; j++) + { + cumSumRtDiffs += anchorPeptideRtDiffs[i - j]; + cumSumRtDiffs += anchorPeptideRtDiffs[i + j]; + } + double avgDiff = cumSumRtDiffs / (2 * numAnchorPepsPerSide); + rtPredictionErrors.Add(avgDiff - anchorPeptideRtDiffs[i]); + } + + double medianRtError = rtPredictionErrors.Median(); + double stdDevRtError = rtPredictionErrors.StandardDeviation(); + + _rtPredictionErrorDistributionDictionary.Add(donorFile, new Normal(medianRtError, stdDevRtError)); } /// @@ -66,16 +94,14 @@ internal void AddRtDiffDistribution(SpectraFileInfo donorFile, Normal rtDiffDist internal double GetRTWindowWidth(SpectraFileInfo donorFile) { // 95% of all peaks are expected to fall within six standard deviations - return _rtDifferenceDistributionDictionary[donorFile].StdDev * 4; + return _rtPredictionErrorDistributionDictionary[donorFile].StdDev * 4; } internal double GetMedianRtDiff(SpectraFileInfo donorFile) { - return _rtDifferenceDistributionDictionary[donorFile].Median; + return _rtPredictionErrorDistributionDictionary[donorFile].Median; } - - /// /// Scores a MBR peak based on it's retention time, ppm error, and intensity /// @@ -83,11 +109,10 @@ internal double GetMedianRtDiff(SpectraFileInfo donorFile) internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak, double predictedRt) { acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); - acceptorPeak.RtScore = CalculateScore(_rtErrorDistribution, acceptorPeak.ApexRetentionTime - predictedRt); + acceptorPeak.RtScore = CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], + predictedRt - acceptorPeak.ApexRetentionTime); acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); - - double donorIdPEP = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First().PosteriorErrorProbability; // Returns 100 times the geometric mean of the four scores return 100 * Math.Pow( acceptorPeak.IntensityScore * acceptorPeak.RtScore * acceptorPeak.PpmScore * acceptorPeak.ScanCountScore, 0.25); From 5cbb1c53debc256edeb2f3621dc14e938ec30aab Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 18 Apr 2024 12:09:50 -0500 Subject: [PATCH 43/55] Mostly deleted stuff that will be introduced in a different commit --- mzLib/FlashLFQ/ChromatographicPeak.cs | 36 +- mzLib/FlashLFQ/FlashLfqEngine.cs | 513 ++++------------------- mzLib/FlashLFQ/Identification.cs | 11 +- mzLib/FlashLFQ/MbrScorer.cs | 10 - mzLib/FlashLFQ/RtInfo.cs | 12 +- mzLib/Test/Test.csproj | 6 +- mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs | 374 ----------------- 7 files changed, 98 insertions(+), 864 deletions(-) delete mode 100644 mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 1ab123c75..353909ed1 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -22,8 +22,6 @@ public class ChromatographicPeak public double IntensityScore { get; set; } public double RtScore { get; set; } public double ScanCountScore { get; set; } - public List ChargeList { get; set; } - public string Collision { get; set; } public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) { @@ -36,8 +34,6 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi IsotopicEnvelopes = new List(); IsMbrPeak = isMbrPeak; SpectraFileInfo = fileInfo; - RandomRt = randomRt; - } public IsotopicEnvelope Apex { get; private set; } @@ -50,16 +46,6 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi /// Expected retention time for MBR acceptor peaks (mean) /// public double? RtHypothesis { get; private set; } - /// - /// Std. Dev of retention time differences between MBR acceptor file and donor file, used if # calibration points < 6 - /// - public double? RtStdDev { get; private set; } - /// - /// Interquartile range of retention time differences between MBR acceptor file and donor file, used if # calibration points >= 6 - /// - public double? RtInterquartileRange { get; private set; } - public bool RandomRt { get; } - public bool DecoyPeptide => Identifications.First().IsDecoy; public static string TabSeparatedHeader { @@ -93,27 +79,10 @@ public static string TabSeparatedHeader sb.Append("Full Sequences Mapped" + "\t"); sb.Append("Peak Split Valley RT" + "\t"); sb.Append("Peak Apex Mass Error (ppm)"); - sb.Append("\t" + "Decoy Peptide"); - sb.Append("\t" + "Random Rt"); - sb.Append("\t" + "Collision"); - //sb.Append("Timepoints"); return sb.ToString(); } } - /// - /// Sets retention time information for a given peak. Used for MBR peaks - /// - /// Expected retention time for peak, based on alignment between a donor and acceptor file - /// Standard deviation in the retention time differences between aligned peaks - /// Interquartile range og the retention time differences between aligned peaks - internal void SetRtWindow(double rtHypothesis, double? rtStdDev, double? rtInterquartileRange) - { - RtHypothesis = rtHypothesis; - RtStdDev = rtStdDev; - RtInterquartileRange = rtInterquartileRange; - } - public void CalculateIntensityForThisFeature(bool integrate) { if (IsotopicEnvelopes.Any()) @@ -166,7 +135,7 @@ public void MergeFeatureWith(ChromatographicPeak otherFeature, bool integrate) this.Identifications = this.Identifications .Union(otherFeature.Identifications) .Distinct() - .OrderBy(p => p.PosteriorErrorProbability).ToList(); + .ToList(); ResolveIdentifications(); this.IsotopicEnvelopes.AddRange(otherFeature.IsotopicEnvelopes .Where(p => !thisFeaturesPeaks.Contains(p.IndexedPeak))); @@ -276,9 +245,6 @@ public override string ToString() sb.Append("" + NumIdentificationsByFullSeq + "\t"); sb.Append("" + SplitRT + "\t"); sb.Append("" + MassError); - sb.Append("\t" + DecoyPeptide); - sb.Append("\t" + RandomRt); - sb.Append("\t" + Collision ?? ""); return sb.ToString(); } diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 45111f342..5856742cc 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -39,20 +39,6 @@ public class FlashLfqEngine public readonly bool MatchBetweenRuns; public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; - - // New MBR Settings - public readonly double RtWindowIncrease = 0; - public readonly double MbrAlignmentWindow = 2.5; - //public readonly double? MbrPpmTolerance; - /// - /// Specifies how the donor peak for MBR is selected. - /// 'S' selects the donor peak associated with the highest scoring PSM - /// 'I' selects the donor peak with the max intensity - /// 'N' selects the donor peak with the most neighboring peaks - /// - public char DonorCriterion { get; init; } - public readonly double DonorQValueThreshold; - public readonly bool RequireMsmsIdInCondition; // settings for the Bayesian protein quantification engine @@ -81,10 +67,6 @@ public class FlashLfqEngine private FlashLfqResults _results; internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; - internal Dictionary> DonorFileToPeakDict { get; private set; } - internal ConcurrentBag DecoyPeaks { get; private set; } - internal List PeptidesForMbr { get; init; } - public FlashLfqEngine( List allIdentifications, @@ -100,7 +82,7 @@ public FlashLfqEngine( // MBR settings bool matchBetweenRuns = false, - double matchBetweenRunsPpmTolerance = 5.0, + double matchBetweenRunsPpmTolerance = 10.0, double maxMbrWindow = 2.5, bool requireMsmsIdInCondition = false, @@ -112,10 +94,7 @@ public FlashLfqEngine( int mcmcBurninSteps = 1000, bool useSharedPeptidesForProteinQuant = false, bool pairedSamples = false, - int? randomSeed = null, - char donorCriterion = 'I', - double donorQValueThreshold = 0.05, - List peptidesForMbr = null) + int? randomSeed = null) { Loaders.LoadElements(); @@ -133,14 +112,6 @@ public FlashLfqEngine( PpmTolerance = ppmTolerance; IsotopePpmTolerance = isotopeTolerancePpm; MatchBetweenRuns = matchBetweenRuns; - if(MatchBetweenRuns) - { - DecoyPeaks = new(); - if(peptidesForMbr != null) - { - PeptidesForMbr = peptidesForMbr; - } - } MbrPpmTolerance = matchBetweenRunsPpmTolerance; Integrate = integrate; NumIsotopesRequired = numIsotopesRequired; @@ -148,8 +119,6 @@ public FlashLfqEngine( Silent = silent; IdSpecificChargeState = idSpecificChargeState; MbrRtWindow = maxMbrWindow; - DonorCriterion = donorCriterion; - DonorQValueThreshold = donorQValueThreshold; RequireMsmsIdInCondition = requireMsmsIdInCondition; Normalize = normalize; @@ -222,8 +191,6 @@ public FlashLfqResults Run() // do MBR if (MatchBetweenRuns) { - Console.WriteLine("Find the best donors for match-between-runs"); - FindPeptideDonorFiles(); foreach (var spectraFile in _spectraFileInfo) { if (!Silent) @@ -240,7 +207,6 @@ public FlashLfqResults Run() Console.WriteLine("Finished MBR for " + spectraFile.FilenameWithoutExtension); } } - _results.DecoyPeaks = DecoyPeaks; } // normalize @@ -578,90 +544,6 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); } - /// - /// For every MSMS identified peptide, selects one file that will be used as the donor - /// by finding files that contain the most peaks in the local neighborhood, - /// then writes the restults to the DonorFileToIdsDict. - /// WARNING! Strong assumption that this is called BEFORE MBR peaks are identified/assigned to the results - /// - private void FindPeptideDonorFiles() - { - DonorFileToPeakDict = new Dictionary>(); - - Dictionary> seqPeakDict = new(); - seqPeakDict = _results.Peaks.SelectMany(kvp => kvp.Value) - .Where(peak => peak.NumIdentificationsByFullSeq == 1 - && peak.IsotopicEnvelopes.Any() - && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) - .GroupBy(peak => peak.Identifications.First().ModifiedSequence) - .ToDictionary(group => group.Key, group => group.ToList()); - - if(PeptidesForMbr.IsNotNullOrEmpty()) - { - // remove all donor sequences not in PeptidesForMbr - Dictionary> filteredSeqPeakDict = new(); - foreach(string seq in PeptidesForMbr) - { - if(seqPeakDict.TryGetValue(seq, out var value)) - { - filteredSeqPeakDict.Add(seq, value); - } - } - seqPeakDict = filteredSeqPeakDict; - } - - // iterate through each unique sequence - foreach (var sequencePeakListKvp in seqPeakDict) - { - List peaksForPeptide = sequencePeakListKvp.Value; - if (!peaksForPeptide.Any()) - continue; - - ChromatographicPeak bestPeak = null; - switch (DonorCriterion) - { - case 'S': // Select best peak by the PSM score - bestPeak = peaksForPeptide.MaxBy(peak => peak.Identifications.First().PsmScore); - if (bestPeak.Identifications.First().PsmScore > 0) - break; - else // if every ID has a score of zero, let it fall through to the default case - goto default; - case 'N': // Select peak with the most neighboring peaks - int maxPeaks = 0; - foreach (var donorPeak in peaksForPeptide) - { - // Count the number of neighboring peaks with unique peptides - int neighboringPeaksCount = _results.Peaks[donorPeak.SpectraFileInfo] - .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) - .Select(peak => peak.Identifications.First().ModifiedSequence) - .Distinct() - .Count(); - - if (neighboringPeaksCount > maxPeaks) - { - maxPeaks = neighboringPeaksCount; - bestPeak = donorPeak; - } - } - break; - case 'I': // Select the peak with the highest intensity - default: - bestPeak = peaksForPeptide.MaxBy(peak => peak.Intensity); - break; - } - - if (bestPeak == null) continue; - if (DonorFileToPeakDict.ContainsKey(bestPeak.SpectraFileInfo)) - { - DonorFileToPeakDict[bestPeak.SpectraFileInfo].Add(bestPeak); - } - else - { - DonorFileToPeakDict.Add(bestPeak.SpectraFileInfo, new List { bestPeak }); - } - } - } - /// /// Used by MBR. Predicts the retention time of a peak in an acceptor file based on the /// retention time of the peak in the donor file. This is done with a local alignment @@ -679,7 +561,7 @@ internal RtInfo PredictRetentionTime( MbrScorer scorer) { var nearbyCalibrationPoints = new List(); - int numberOfAnchors = 4; // The number of anchor peptides to be used for local alignment. Must be an even number! + int numberOfAnchorsPerSide = 2; // The number of anchor peptides to be used for local alignment (on either side of the donor peptide) // only compare +- 1 fraction if (acceptorSampleIsFractionated && donorSampleIsFractionated) @@ -720,7 +602,7 @@ internal RtInfo PredictRetentionTime( } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); numberOfForwardAnchors++; - if(numberOfForwardAnchors >= numberOfAnchors / 2) // We only want a handful of anchor points + if(numberOfForwardAnchors >= numberOfAnchorsPerSide) // We only want a handful of anchor points { break; } @@ -740,21 +622,16 @@ internal RtInfo PredictRetentionTime( } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); numberOfBackwardsAnchors++; - if (numberOfBackwardsAnchors >= numberOfAnchors / 2) // We only want a handful of anchor points + if (numberOfBackwardsAnchors >= numberOfAnchorsPerSide) // We only want a handful of anchor points { break; } } } - double medianRtDiff; - double rtRange; if (!nearbyCalibrationPoints.Any()) { - // Default rt adjustments return null; - //medianRtDiff = scorer.GetMedianRtDiff(donorPeak.SpectraFileInfo); - //rtRange = scorer.GetRTWindowWidth(donorPeak.SpectraFileInfo); } // calculate difference between acceptor and donor RTs for these RT region @@ -762,17 +639,12 @@ internal RtInfo PredictRetentionTime( .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) .ToList(); - medianRtDiff = rtDiffs.Median(); - rtRange = rtDiffs.InterquartileRange() * 4.5; // This is roughly equivalent to 2 standard deviations - - - double? rtStdDev = null; - double? rtInterquartileRange = null; + double medianRtDiff = rtDiffs.Median(); + double rtRange = rtDiffs.InterquartileRange() * 4.5; // This is roughly equivalent to 2 standard deviations - //TODO: Expand range and see what happens - rtRange = Math.Min(rtRange+RtWindowIncrease, MbrRtWindow+RtWindowIncrease); + rtRange = Math.Min(rtRange, MbrRtWindow); - return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: rtRange); } /// @@ -836,7 +708,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // these are the analytes already identified in this run. we don't need to try to match them from other runs var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks - .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) + .Where(peak => peak.IsotopicEnvelopes.Any()) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); @@ -862,24 +734,23 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // this stores the results of MBR - ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); - Random randomGenerator = new Random(); - - // This stores the results of a check where we examine whether MBR can return the same peak as the MSMS peak - ConcurrentDictionary>> doubleCheckPeaks = new(); + var matchBetweenRunsIdentifiedPeaks = new Dictionary>>(); // map each donor file onto this file - foreach (var donorFilePeakListKvp in DonorFileToPeakDict) + foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) { - if (idAcceptorFile.Equals(donorFilePeakListKvp.Key)) + if (idAcceptorFile.Equals(idDonorFile)) { continue; } // this is the list of peaks identified in the other file but not in this one ("ID donor peaks") - List idDonorPeaks = donorFilePeakListKvp.Value - .Where(p => !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) - && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); + List idDonorPeaks = _results.Peaks[idDonorFile].Where(p => + !p.IsMbrPeak + && p.NumIdentificationsByFullSeq == 1 + && p.IsotopicEnvelopes.Any() + && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) + && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); if (!idDonorPeaks.Any()) { @@ -887,7 +758,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } bool donorSampleIsFractionated = _results.SpectraFiles - .Where(p => p.Condition == donorFilePeakListKvp.Key.Condition && p.BiologicalReplicate == donorFilePeakListKvp.Key.BiologicalReplicate) + .Where(p => p.Condition == idDonorFile.Condition && p.BiologicalReplicate == idDonorFile.BiologicalReplicate) .Select(p => p.Fraction) .Distinct() .Count() > 1; @@ -895,19 +766,21 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // We're only interested in the fold change if the conditions are different. Otherwise, we score based off of the intensities // of the acceptor file if (_spectraFileInfo.Select(p => p.Condition).Distinct().Count() > 1 - && donorFilePeakListKvp.Key.Condition != idAcceptorFile.Condition) + && idDonorFile.Condition != idAcceptorFile.Condition) { scorer.CalculateFoldChangeBetweenFiles(idDonorPeaks); } // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile, scorer); + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile, scorer); // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (range, loopState) => { + var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); + for (int i = range.Item1; i < range.Item2; i++) { ChromatographicPeak donorPeak = idDonorPeaks[i]; @@ -915,141 +788,50 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); if (rtInfo == null) continue; - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestAcceptor); - - // Draw a random donor that has an rt sufficiently far enough away - ChromatographicPeak randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; - int randomPeaksSampled = 1; - // multiply for safety, in case the relative rt shifts after alignment - double minimumRtDifference = Math.Min(rtInfo.Width * 1.5, 0.5); - double massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); - - while (randomDonor == null - || massDiff < 0.1 - || massDiff > 50 // Need the random one to be relatively close in mass (but not too close!) - || randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence - || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumRtDifference) - { - randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; - massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); - if (randomPeaksSampled++ > (rtCalibrationCurve.Length - 1)) - { - randomDonor = null; - break; // Prevent infinite loops - } - } - if (randomDonor == null) continue; - - // Map the random rt onto the new file - RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); - if (decoyRtInfo == null) continue; - // Find a decoy peak using the randomly drawn retention time - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, randomRt:decoyRtInfo.PredictedRt); - if(bestDecoy != null) - { - DecoyPeaks.Add(bestDecoy); - } + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); } - }); - - HashSet acceptorFileTopMsmsSeqs = rtCalibrationCurve.Where(point => point.DonorFilePeak != null && point.AcceptorFilePeak != null) - .Select(p => p.AcceptorFilePeak.Identifications.First().ModifiedSequence) - .Take(10000) - .ToHashSet(); - - // List of donor peaks where the peptide WAS identified in the acceptor file and the best donor is a different file. - // This will be used for checking the error rate - // For each sequence, we only select one peak corresponding to the PSM with the lowest q-value - List donorPeaksWithMsms = donorFilePeakListKvp.Value - .Where(p => acceptorFileTopMsmsSeqs.Contains(p.Identifications.First().ModifiedSequence) - && p.SpectraFileInfo != idAcceptorFile - && !p.Identifications.First().IsDecoy) - .Take(2500) - .ToList(); - - if (donorPeaksWithMsms.Count < 1) continue; - - // Loop through every MSMS id in the donor file - Parallel.ForEach(Partitioner.Create(0, donorPeaksWithMsms.Count), - new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, - (range, loopState) => - { - for (int i = range.Item1; i < range.Item2; i++) + lock (matchBetweenRunsIdentifiedPeaks) { - ChromatographicPeak donorPeak = donorPeaksWithMsms[i]; - // TODO: Add a toggle that set rtRange to be maximum width - RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); - if (rtInfo == null) continue; - - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, doubleCheckPeaks, out var bestAcceptor); - - // Then look for a peak decoy - // Draw a random donor that has an rt sufficiently far enough away - ChromatographicPeak randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; - int randomPeaksSampled = 1; - // multiply for safety, in case the relative rt shifts after alignment - double minimumRtDifference = Math.Min(rtInfo.Width * 1.5, 0.5); - double massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); - - while (randomDonor == null - || massDiff < 0.1 - || massDiff > 50 // Need the random one to be relatively close in mass (but not too close!) - || randomDonor.Identifications.First().ModifiedSequence == donorPeak.Identifications.First().ModifiedSequence - || Math.Abs(randomDonor.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime) < minimumRtDifference) + foreach (var kvp in matchBetweenRunsIdentifiedPeaksThreadSpecific) { - randomDonor = rtCalibrationCurve[randomGenerator.Next(rtCalibrationCurve.Length)].DonorFilePeak; - massDiff = Math.Abs(randomDonor.Identifications.First().PeakfindingMass - donorPeak.Identifications.First().PeakfindingMass); - if (randomPeaksSampled++ > (rtCalibrationCurve.Length - 1)) + if (matchBetweenRunsIdentifiedPeaks.TryGetValue(kvp.Key, out var list)) { - randomDonor = null; - break; // Prevent infinite loops + foreach (var peak in kvp.Value) + { + if (list.TryGetValue(peak.Key, out List existing)) + { + foreach (var acceptorPeak in peak.Value) + { + var samePeakSameSequence = existing + .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); + + if (samePeakSameSequence != null) + { + samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; + samePeakSameSequence.Identifications.Add(acceptorPeak.Identifications.First()); + } + else + { + existing.Add(acceptorPeak); + } + } + } + else + { + list.Add(peak.Key, peak.Value); + } + } + } + else + { + matchBetweenRunsIdentifiedPeaks.Add(kvp.Key, kvp.Value); } - } - if (randomDonor == null) continue; - - // Map the random rt onto the new file - RtInfo decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); - if (decoyRtInfo == null) continue; - // Find a decoy peak using the randomly drawn retention time - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaks, out var bestDecoy, randomRt: decoyRtInfo.PredictedRt); - - // store occasions where no peak was found. This allows us to calculate sensitivity - if (bestAcceptor == null && bestDecoy == null) - { - doubleCheckPeaks.TryAdd( - key: donorPeak.Identifications.First().ModifiedSequence, - value: null); } } }); } - // Eliminate duplicate peaks (not sure where they come from) - foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) - { - // Each isotopic envelope is linked to a list of ChromatographicPeaks - // Here, we remove instances where the same envelope is associated with multiple chromatographic peaks but the peaks correspond to the same donor peptide - // I don't know why this happens lol - // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications, then they're kept separate. - foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) - { - List bestPeaks = new(); - foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) - { - bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); - } - envelopePeakListKvp.Value.Clear(); - envelopePeakListKvp.Value.AddRange(bestPeaks); - } - } - - // Create a dictionary that stores imsPeak associated with an ms/ms identified peptide - Dictionary> msmsImsPeaks = _results.Peaks[idAcceptorFile].Where(peak => peak.Apex?.IndexedPeak != null) - .Select(peak => peak.Apex.IndexedPeak) - .GroupBy(imsPeak => imsPeak.ZeroBasedMs1ScanIndex) - .ToDictionary(g => g.Key, g => g.ToList()); - // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -1060,80 +842,9 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); - ChromatographicPeak best = peakHypotheses.First(); - peakHypotheses.Remove(best); - // Discard any peaks that are already associated with an ms/ms identified peptide - while(best.Apex?.IndexedPeak != null && msmsImsPeaks.TryGetValue(best.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList)) - { - if(peakList.Contains(best.Apex.IndexedPeak)) - { - if(!peakHypotheses.Any()) - { - best = null; - break; - } - best = peakHypotheses.First(); - peakHypotheses.Remove(best); - } - else - { - break; - } - } - if (best == null) continue; - - // merge peaks with different charge states - if (peakHypotheses.Count > 0) - { - double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); - double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); - - foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) - { - if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) - { - best.MergeFeatureWith(peak, Integrate); - } - } - } - _results.Peaks[idAcceptorFile].Add(best); - } - - // repeat basically the same procedure for the double-check peaks - foreach (var seqDictionaryKvp in doubleCheckPeaks.Where(kvp => kvp.Value != null)) - { - // Each isotopic envelope is linked to a list of ChromatographicPeaks - // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications and they're kept separate. - foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) - { - List bestPeaks = new(); - foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) - { - bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); - } - envelopePeakListKvp.Value.Clear(); - envelopePeakListKvp.Value.AddRange(bestPeaks); - } - } - - // take the best result (highest scoring) for each peptide after we've matched from all the donor files - foreach (var mbrIdentifiedPeptide in doubleCheckPeaks.Where(p => acceptorFileIdentifiedSequences.Contains(p.Key))) - { - string peptideModifiedSequence = mbrIdentifiedPeptide.Key; - List msmsPeaks = acceptorFileIdentifiedPeaks - .Where(peak => peak.Apex != null && peak.Identifications.First().ModifiedSequence.Equals(peptideModifiedSequence)).ToList(); - if (!msmsPeaks.Any()) continue; //This shouldn't happen - if (mbrIdentifiedPeptide.Value == null) - { - ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); - nullPeak.Collision = "0"; // Zero represent peak not found - _results.DoubleCheckPeaks[idAcceptorFile].Add(nullPeak); - continue; - } + ChromatographicPeak best = peakHypotheses.First(); - List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); - ChromatographicPeak best = peakHypotheses.FirstOrDefault(); peakHypotheses.Remove(best); if (peakHypotheses.Count > 0) @@ -1153,43 +864,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } } - if (best == null || best.Apex == null) - { - ChromatographicPeak nullPeak = new ChromatographicPeak(msmsPeaks.First().Identifications.First(), isMbrPeak: false, idAcceptorFile); - nullPeak.Collision = "0"; // Zero represent peak not found - _results.DoubleCheckPeaks[idAcceptorFile].Add(nullPeak); - continue; - } - - if (msmsPeaks.Any(peak => peak.Apex.Equals(best.Apex))) - { - best.Collision = "1"; // One is best possible - } - else - { - var test = msmsPeaks.Where(peak => Math.Abs(peak.Apex.IndexedPeak.RetentionTime - best.Apex.IndexedPeak.RetentionTime) < 0.0001); - if (test.IsNotNullOrEmpty()) - { - best.Collision = "2"; // Assumed same time, different charge state - } - else - { - test = msmsPeaks.Where(peak => - { - var rts = peak.IsotopicEnvelopes.Select(e => e.IndexedPeak.RetentionTime); - return best.ApexRetentionTime >= rts.Minimum() && best.ApexRetentionTime <= rts.Maximum(); - }); - if (test.IsNotNullOrEmpty()) - { - best.Collision = "3"; // Overlap peak - } - else - { - best.Collision = "-1"; - } - } - } - _results.DoubleCheckPeaks[idAcceptorFile].Add(best); + _results.Peaks[idAcceptorFile].Add(best); } @@ -1211,25 +886,21 @@ internal void FindAllAcceptorPeaks( RtInfo rtInfo, Tolerance fileSpecificTol, ChromatographicPeak donorPeak, - ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks, - out ChromatographicPeak bestAcceptor, - double? randomRt = null) + Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific) { // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; - double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - (rtInfo.Width / 2.0); - double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + (rtInfo.Width / 2.0); for (int j = 0; j < ms1ScanInfos.Length; j++) { Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= rtStartHypothesis) + if (scan.RetentionTime <= rtInfo.RtStartHypothesis) { start = scan; } - if (scan.RetentionTime >= rtEndHypothesis) + if (scan.RetentionTime >= rtInfo.RtEndHypothesis) { end = scan; break; @@ -1245,7 +916,6 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - bestAcceptor = null; foreach (int z in chargesToMatch) { @@ -1267,38 +937,37 @@ internal void FindAllAcceptorPeaks( while (chargeEnvelopes.Any()) { ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, - fileSpecificTol, rtInfo, z, chargeEnvelopes, randomRt); + fileSpecificTol, rtInfo, z, chargeEnvelopes); if (acceptorPeak == null) - continue; - if (bestAcceptor == null || bestAcceptor.MbrScore < acceptorPeak.MbrScore) - { - acceptorPeak.ChargeList = chargesToMatch; - bestAcceptor = acceptorPeak; - } - + continue; // save the peak hypothesis - matchBetweenRunsIdentifiedPeaks.AddOrUpdate - ( - // new key - key: donorIdentification.ModifiedSequence, - // if we are adding a value for the first time, we simply create a new dictionatry with one entry - addValueFactory: (sequenceKey) => - new ConcurrentDictionary>( - new Dictionary> + if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) + { + if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) + { + var samePeakSameSequence = existing + .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); + + if (samePeakSameSequence != null) + { + samePeakSameSequence.Identifications.Add(donorIdentification); + } + else { - { acceptorPeak.Apex, new List { acceptorPeak } } - }), - // if the key (sequence) already exists, we have to add the new peak to the existing dictionary - updateValueFactory: (sequenceKey, envelopePeakListDict) => + existing.Add(acceptorPeak); + } + } + else { - envelopePeakListDict.AddOrUpdate( - key: acceptorPeak.Apex, - addValueFactory: (envelopeKey) => new List { acceptorPeak }, // if the key (envelope) doesnt exist, just create a new list - updateValueFactory: (envelopeKey, peakList) => { peakList.Add(acceptorPeak); return peakList; }); // if the key (envelope) already exists, add the peak to the associated list - return envelopePeakListDict; + mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); } - ); + } + else + { + matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); + matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); + } } } } @@ -1321,11 +990,10 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( Tolerance mbrTol, RtInfo rtInfo, int z, - List chargeEnvelopes, - double? randomRt = null) + List chargeEnvelopes) { var donorId = donorPeak.Identifications.OrderBy(p => p.QValue).First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt != null); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile); // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -1333,7 +1001,6 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorId, z); acceptorPeak.IsotopicEnvelopes.AddRange(bestChargeEnvelopes); acceptorPeak.CalculateIntensityForThisFeature(Integrate); - acceptorPeak.SetRtWindow(rtInfo.PredictedRt, rtInfo.RtSd, rtInfo.RtInterquartileRange); CutPeak(acceptorPeak, seedEnv.IndexedPeak.RetentionTime); @@ -1349,7 +1016,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, randomRt ?? rtInfo.PredictedRt); + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, rtInfo.PredictedRt); return acceptorPeak; } diff --git a/mzLib/FlashLFQ/Identification.cs b/mzLib/FlashLFQ/Identification.cs index 4abee4105..85c557c3e 100644 --- a/mzLib/FlashLFQ/Identification.cs +++ b/mzLib/FlashLFQ/Identification.cs @@ -15,16 +15,11 @@ public class Identification public readonly ChemicalFormula OptionalChemicalFormula; public readonly bool UseForProteinQuant; public double PeakfindingMass; - public double PosteriorErrorProbability; - public double PsmScore { get; init; } - public double QValue { get; init; } - public bool IsDecoy { get; } public Identification(SpectraFileInfo fileInfo, string BaseSequence, string ModifiedSequence, double monoisotopicMass, double ms2RetentionTimeInMinutes, int chargeState, List proteinGroups, - ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true, double posteriorErrorProbability = 0, - double psmScore = 0, double qValue = 0, bool decoy = false) + ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true) { this.FileInfo = fileInfo; this.BaseSequence = BaseSequence; @@ -35,10 +30,6 @@ public Identification(SpectraFileInfo fileInfo, string BaseSequence, string Modi this.ProteinGroups = new HashSet(proteinGroups); this.OptionalChemicalFormula = optionalChemicalFormula; UseForProteinQuant = useForProteinQuant; - PosteriorErrorProbability = posteriorErrorProbability; - QValue = qValue; - PsmScore = psmScore; - IsDecoy = decoy; } public override string ToString() diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 261023c0a..954361a3f 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -138,16 +138,6 @@ internal double CalculateIntensityScore(double acceptorIntensity, Chromatographi { var logIntensity = Math.Log(acceptorIntensity, 2); return CalculateScore(_logIntensityDistribution, logIntensity); - - - // I don't know what the if/else statement accomplishes. It feels like we should take the density regardless - // As it is, the score is artifically inflated for very intense peaks - //if (logIntensity < _logIntensityDistribution.Median) - // intensityDensity = _logIntensityDistribution.Density(logIntensity); - //else - // intensityDensity = _logIntensityDistribution.Density(_logIntensityDistribution.Mode); - - //alternate, more straightforward approach } } diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs index e6c4552ad..613aedd43 100644 --- a/mzLib/FlashLFQ/RtInfo.cs +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -10,17 +10,15 @@ public class RtInfo { public double PredictedRt { get; } public double Width { get; } - public double? RtSd { get; } - public double? RtInterquartileRange { get; } - public double RtStartHypothesis => PredictedRt - Math.Max((Width / 2.0), 0.25); - public double RtEndHypothesis => PredictedRt + Math.Max((Width / 2.0), 0.25); + // the Math.Max components ensure that the width of an RT Window is at least _minimumWindowWidth wide + private double _minimumWindowWidth = 0.5; + public double RtStartHypothesis => PredictedRt - Math.Max((Width / 2.0), _minimumWindowWidth/2); + public double RtEndHypothesis => PredictedRt + Math.Max((Width / 2.0), _minimumWindowWidth/2); - public RtInfo(double predictedRt, double width, double? rtSd, double? rtInterquartileRange) + public RtInfo(double predictedRt, double width) { PredictedRt = predictedRt; Width = width; - RtSd = rtSd; - RtInterquartileRange = rtInterquartileRange; } } } diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index aebc282ad..9af65a25b 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -13,15 +13,11 @@ - - runtime; build; native; contentfiles; analyzers; buildtransitive - all - + - diff --git a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs b/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs deleted file mode 100644 index a136647d6..000000000 --- a/mzLib/TestFlashLFQ/MbrTargetDecoyTest.cs +++ /dev/null @@ -1,374 +0,0 @@ -using Chemistry; -using FlashLFQ; -using MassSpectrometry; -using MathNet.Numerics.Distributions; -using MathNet.Numerics.Statistics; -using MzLibUtil; -using NUnit.Framework; -using Proteomics.AminoAcidPolymer; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using Easy.Common.Extensions; -using Test.FileReadingTests; -using UsefulProteomicsDatabases; -using ChromatographicPeak = FlashLFQ.ChromatographicPeak; -using Stopwatch = System.Diagnostics.Stopwatch; -using Peptide = Proteomics.AminoAcidPolymer.Peptide; -using System.Windows.Shapes; - -namespace Test -{ - [TestFixture] - [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] - internal class MbrTargetDecoyTest - { - [Test] - [TestCase(0, ExpectedResult = 0)] - [TestCase(1, ExpectedResult = -1)] - [TestCase(2, ExpectedResult = 1)] - [TestCase(3, ExpectedResult = -2)] - [TestCase(5, ExpectedResult = -3)] - [TestCase(6, ExpectedResult = 3)] - public static int TestDecoySearchFlipFlop(int searchCount) - { - // Integer division take ceiling: https://stackoverflow.com/questions/17944/how-to-round-up-the-result-of-integer-division - int result = (searchCount + 2 - 1) / 2; - result = searchCount % 2 == 0 ? result : -1 * result; - - return result; - } - - [Test] - // This is gonna have a bunch of local file references, just a heads up. Dont make github try and build this one - public static void TwoFileMbrTest() - { - //string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\subset_psms.psmtsv"; - string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllPSMs_1PercentFdr.psmtsv"; - - SpectraFileInfo j5 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J5.raw", "a", 0, 0, 0); - SpectraFileInfo j6 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J6.raw", "a", 1, 0, 0); - - List ids = new List(); - Dictionary allProteinGroups = new Dictionary(); - foreach (string line in File.ReadAllLines(psmFile)) - { - var split = line.Split(new char[] { '\t' }); - - if (split.Contains("File Name") || string.IsNullOrWhiteSpace(line)) - { - continue; - } - - SpectraFileInfo file = null; - - if (split[0].Contains("J5")) - { - file = j5; - } - else if (split[0].Contains("J6")) - { - file = j6; - } - - string baseSequence = split[12]; - string fullSequence = split[13]; - double monoMass = double.Parse(split[22]); - double rt = double.Parse(split[2]); - int z = (int)double.Parse(split[6]); - var proteins = split[25].Split(new char[] { '|' }); - List proteinGroups = new List(); - foreach (var protein in proteins) - { - if (allProteinGroups.TryGetValue(protein, out var proteinGroup)) - { - proteinGroups.Add(proteinGroup); - } - else - { - allProteinGroups.Add(protein, new ProteinGroup(protein, "", "Homo Sapiens")); - proteinGroups.Add(allProteinGroups[protein]); - } - } - - bool isDecoy = split[32] == "Y"; - double score = double.TryParse(split[9], out var s) ? s : 0; - - Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy, psmScore: score); - ids.Add(id); - } - - - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5, donorCriterion: 'S'); - var results = engine.Run(); - - var test = results.Peaks.Values.SelectMany(peakList => peakList).ToList(); - int place = 0; - - List mbrPeaks = new(); - - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.RandomRt && !peak.DecoyPeptide).ToList()); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && !peak.RandomRt).ToList()); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && peak.RandomRt).ToList()); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - - - using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_1PepTest.tsv")) - { - writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - foreach (var peak in mbrPeaks) - { - writer.WriteLine(peak); - } - } - - //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) - //{ - // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - // foreach (var peak in engine.DecoyPeaks) - // { - // writer.WriteLine(peak); - // } - //} - - var f1r1MbrResults = results - .PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); - - Assert.That(f1r1MbrResults.Count >= 132); - - results.WriteResults(peaksOutputPath: @"C:\Users\Alex\Desktop\FlashTest\AllPeaks.tsv", null, null, null, true); - - var f1r2MbrResults = results.PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MSMS && p.Value.GetDetectionType(j6) == DetectionType.MBR).ToList(); - - Assert.That(f1r2MbrResults.Count >= 77); - - List<(double, double)> peptideIntensities = new List<(double, double)>(); - - foreach (var peptide in f1r1MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j5)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j6)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } - - double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.Greater(corr, 0.8); - - peptideIntensities.Clear(); - foreach (var peptide in f1r2MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j6)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j5)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } - - corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - - Assert.That(corr > 0.7); - - // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein - // has to be observed in a condition for match-between-runs - j5.Condition = "b"; - engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); - results = engine.Run(); - var proteinsObservedInF1 = ids.Where(p => p.FileInfo == j5).SelectMany(p => p.ProteinGroups).Distinct().ToList(); - var proteinsObservedInF2 = ids.Where(p => p.FileInfo == j6).SelectMany(p => p.ProteinGroups).Distinct().ToList(); - var proteinsObservedInF1ButNotF2 = proteinsObservedInF1.Except(proteinsObservedInF2).ToList(); - foreach (ProteinGroup protein in proteinsObservedInF1ButNotF2) - { - Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(j6) == 0); - } - } - - - [Test] - // This is gonna have a bunch of local file references, just a heads up. Dont make github try and build this one - public static void ThreeFileMbrTest() - { - //string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\subset_psms.psmtsv"; - string psmFile = @"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\AllPSMs_1PercentFdr.psmtsv"; - string psmFile2 = @"D:\SingleCellDataSets\Organoid\Search_MM_320\Task1-SearchTask\Individual File Results\HFL1SC_Unhealthy_CH2_J7-calib_PSMs.psmtsv"; - - SpectraFileInfo j5 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J5.raw", "a", 0, 0, 0); - SpectraFileInfo j6 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\raw_files\HFL1SC_Unhealthy_CH2_J6.raw", "a", 1, 0, 0); - SpectraFileInfo j7 = new SpectraFileInfo(@"D:\SingleCellDataSets\Organoid\Calibration_MM_320\Task1-CalibrateTask\HFL1SC_Unhealthy_CH2_J7-calib.mzML", "a", 2, 0, 0); - - - List ids = new List(); - Dictionary allProteinGroups = new Dictionary(); - foreach (string line in File.ReadAllLines(psmFile)) - { - var split = line.Split(new char[] { '\t' }); - - if (split.Contains("File Name") || string.IsNullOrWhiteSpace(line)) - { - continue; - } - - SpectraFileInfo file = null; - - if (split[0].Contains("J5")) - { - file = j5; - } - else if (split[0].Contains("J6")) - { - file = j6; - } - - string baseSequence = split[12]; - string fullSequence = split[13]; - double monoMass = double.Parse(split[22]); - double rt = double.Parse(split[2]); - int z = (int)double.Parse(split[6]); - var proteins = split[25].Split(new char[] { '|' }); - List proteinGroups = new List(); - foreach (var protein in proteins) - { - if (allProteinGroups.TryGetValue(protein, out var proteinGroup)) - { - proteinGroups.Add(proteinGroup); - } - else - { - allProteinGroups.Add(protein, new ProteinGroup(protein, "", "")); - proteinGroups.Add(allProteinGroups[protein]); - } - } - - bool isDecoy = split[32] == "Y"; - - Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy); - ids.Add(id); - } - foreach(string line in File.ReadAllLines(psmFile2)) - { - var split = line.Split(new char[] { '\t' }); - - if (split.Contains("File Name") || string.IsNullOrWhiteSpace(line)) - { - continue; - } - - SpectraFileInfo file = j7; - - double qval = Double.Parse(split[50]); - if (qval > 0.01) continue; - - string baseSequence = split[12]; - string fullSequence = split[13]; - if(!double.TryParse(split[22], out var x)) - { - continue; // Occurs for ambiguous peptides - } - double monoMass = double.Parse(split[22]); - double rt = double.Parse(split[2]); - int z = (int)double.Parse(split[6]); - var proteins = split[25].Split(new char[] { '|' }); - List proteinGroups = new List(); - foreach (var protein in proteins) - { - if (allProteinGroups.TryGetValue(protein, out var proteinGroup)) - { - proteinGroups.Add(proteinGroup); - } - else - { - allProteinGroups.Add(protein, new ProteinGroup(protein, "", "")); - proteinGroups.Add(allProteinGroups[protein]); - } - } - - bool isDecoy = split[32] == "Y"; - double score = double.TryParse(split[9], out var s) ? s : 0; - - Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: isDecoy, psmScore: score); - ids.Add(id); - } - - - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5, donorCriterion: 'S'); - var results = engine.Run(); - - var test = results.Peaks.Values.SelectMany(peakList => peakList).ToList(); - int place = 0; - - List mbrPeaks = new(); - - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.RandomRt && !peak.DecoyPeptide).ToList()); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && !peak.RandomRt).ToList()); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && peak.DecoyPeptide && peak.RandomRt).ToList()); - mbrPeaks.AddRange(test.Where(peak => peak.IsMbrPeak && !peak.DecoyPeptide & !peak.RandomRt).ToList()); - - - //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\MbrResults_1PeakPerPepScore.tsv")) - //{ - // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - // foreach (var peak in mbrPeaks) - // { - // writer.WriteLine(peak); - // } - //} - - //using (StreamWriter writer = new StreamWriter(@"D:\SingleCellDataSets\Organoid\TwoFileSearch\Task1-SearchTask\RealMBR\AllDecoys_minRtDiff.tsv")) - //{ - // writer.WriteLine(ChromatographicPeak.TabSeparatedHeader); - // foreach (var peak in engine.DecoyPeaks) - // { - // writer.WriteLine(peak); - // } - //} - - var f1r1MbrResults = results - .PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MBR && p.Value.GetDetectionType(j6) == DetectionType.MSMS).ToList(); - - Assert.That(f1r1MbrResults.Count >= 132); - - var f1r2MbrResults = results.PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(j5) == DetectionType.MSMS && p.Value.GetDetectionType(j6) == DetectionType.MBR).ToList(); - - Assert.That(f1r2MbrResults.Count >= 77); - - List<(double, double)> peptideIntensities = new List<(double, double)>(); - - foreach (var peptide in f1r1MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j5)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j6)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } - - double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.Greater(corr, 0.8); - - peptideIntensities.Clear(); - foreach (var peptide in f1r2MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(j6)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(j5)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } - - corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - - Assert.That(corr > 0.7); - - // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein - // has to be observed in a condition for match-between-runs - j5.Condition = "b"; - engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); - results = engine.Run(); - var proteinsObservedInF1 = ids.Where(p => p.FileInfo == j5).SelectMany(p => p.ProteinGroups).Distinct().ToList(); - var proteinsObservedInF2 = ids.Where(p => p.FileInfo == j6).SelectMany(p => p.ProteinGroups).Distinct().ToList(); - var proteinsObservedInF1ButNotF2 = proteinsObservedInF1.Except(proteinsObservedInF2).ToList(); - foreach (ProteinGroup protein in proteinsObservedInF1ButNotF2) - { - Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(j6) == 0); - } - } - } -} From c5249aa9cadd0e0b09b5374abcab8d49ac45458e Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 18 Apr 2024 12:29:25 -0500 Subject: [PATCH 44/55] Fixed all tests but one --- mzLib/FlashLFQ/FlashLFQResults.cs | 36 +---------------------- mzLib/FlashLFQ/FlashLfqEngine.cs | 4 +-- mzLib/FlashLFQ/IndexedMassSpectralPeak.cs | 1 - mzLib/FlashLFQ/MbrScorer.cs | 6 ++++ mzLib/Test/TestDeconvolution.cs | 11 ++++--- mzLib/TestFlashLFQ/TestFlashLFQ.cs | 10 ++----- 6 files changed, 17 insertions(+), 51 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index 4e8a11bfb..45da8d2d0 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -14,8 +14,6 @@ public class FlashLfqResults public readonly Dictionary PeptideModifiedSequences; public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; - public readonly Dictionary> DoubleCheckPeaks; - public IEnumerable DecoyPeaks { get; set; } public FlashLfqResults(List spectraFiles, List identifications) { @@ -23,12 +21,10 @@ public FlashLfqResults(List spectraFiles, List PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); - DoubleCheckPeaks = new Dictionary>(); foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); - DoubleCheckPeaks.Add(file, new List()); } foreach (Identification id in identifications) @@ -544,7 +540,7 @@ public void CalculateProteinResultsMedianPolish(bool useSharedPeptides) } } - public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent, string decoyPath = null) + public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent = true) { if (!silent) { @@ -566,36 +562,6 @@ public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, st } } - string[] pathSplit = peaksOutputPath.Split(Path.DirectorySeparatorChar); - pathSplit[^1] = "DoubleCheckedPeaks.tsv"; - - using (var output = new StreamWriter(String.Join(Path.DirectorySeparatorChar, pathSplit))) - { - output.WriteLine(ChromatographicPeak.TabSeparatedHeader); - - foreach (var peak in DoubleCheckPeaks.SelectMany(p => p.Value) - .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) - .ThenByDescending(p => p.Collision)) - { - output.WriteLine(peak.ToString()); - } - } - - if (decoyPath != null & DecoyPeaks.IsNotNullOrEmpty()) - { - using (StreamWriter output = new StreamWriter(decoyPath)) - { - output.WriteLine(ChromatographicPeak.TabSeparatedHeader); - - foreach (var peak in DecoyPeaks - .OrderBy(p => p.SpectraFileInfo.FilenameWithoutExtension) - .ThenByDescending(p => p.MbrScore)) - { - output.WriteLine(peak.ToString()); - } - } - } - if (modPeptideOutputPath != null) { using (StreamWriter output = new StreamWriter(modPeptideOutputPath)) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 5856742cc..069253d58 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -915,7 +915,7 @@ internal void FindAllAcceptorPeaks( chargesToMatch.Add(donorPeak.Apex.ChargeState); } - Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); + Identification donorIdentification = donorPeak.Identifications.First(); foreach (int z in chargesToMatch) { @@ -992,7 +992,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( int z, List chargeEnvelopes) { - var donorId = donorPeak.Identifications.OrderBy(p => p.QValue).First(); + var donorId = donorPeak.Identifications.First(); var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile); // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list diff --git a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs index eb76d54c4..c9aa89042 100644 --- a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs +++ b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs @@ -23,7 +23,6 @@ public override bool Equals(object obj) var otherPeak = (IndexedMassSpectralPeak)obj; return otherPeak != null - //&& Math.Abs(otherPeak.Mz - this.Mz) < 0.00000001 && otherPeak.Mz == this.Mz && otherPeak.ZeroBasedMs1ScanIndex == this.ZeroBasedMs1ScanIndex; } diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 954361a3f..c5c375fd0 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -80,6 +80,12 @@ internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List rtPredictionErrors.Add(avgDiff - anchorPeptideRtDiffs[i]); } + if(!rtPredictionErrors.Any()) + { + _rtPredictionErrorDistributionDictionary.Add(donorFile, new Normal(0, 1)); + return; + } + double medianRtError = rtPredictionErrors.Median(); double stdDevRtError = rtPredictionErrors.StandardDeviation(); diff --git a/mzLib/Test/TestDeconvolution.cs b/mzLib/Test/TestDeconvolution.cs index f8a9e3c7e..7fabc4751 100644 --- a/mzLib/Test/TestDeconvolution.cs +++ b/mzLib/Test/TestDeconvolution.cs @@ -182,14 +182,13 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid Deconvoluter deconvoluter = new Deconvoluter(DeconvolutionType.ClassicDeconvolution, deconParameters); //check assigned correctly - - List lie2 = deconvoluter.Deconvolute(singlespec, singleRange).ToList(); - List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); - Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); + //List lie2 = deconvoluter.Deconvolute(singlespec, singleRange).ToList(); + //List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); + //Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); //check that if already assigned, skips assignment and just recalls same value - List lie3 = deconvoluter.Deconvolute(singlespec, singleRange).ToList(); - Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); + //List lie3 = deconvoluter.Deconvolute(singlespec, singleRange).ToList(); + //Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); } #endregion diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index c5f5497de..772768b01 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -585,9 +585,6 @@ public static void TestFlashLfqMatchBetweenRuns() if (i == 2) continue; // exclude the mbr peak from the calculation rtDiffs.Add(Math.Abs(file1Rt[i] - file2Rt[i])); } - Assert.That(peak.RtStdDev.HasValue); - Assert.That(!peak.RtInterquartileRange.HasValue); - Assert.That(peak.RtStdDev, Is.EqualTo(rtDiffs.StandardDeviation()).Within(0.01)); Assert.That(results.Peaks[file1].Count == 5); Assert.That(!results.Peaks[file1].Any(p => p.IsMbrPeak)); @@ -603,9 +600,6 @@ public static void TestFlashLfqMatchBetweenRuns() if (i == 2) continue; // exclude the mbr peak from the calculation rtDiffs.Add(Math.Abs(file1Rt[i] - file2Rt[i])); } - Assert.That(!peak.RtStdDev.HasValue); - Assert.That(peak.RtInterquartileRange.HasValue); - Assert.That(peak.RtInterquartileRange, Is.EqualTo(rtDiffs.InterquartileRange()).Within(0.01)); } [Test] @@ -1368,7 +1362,9 @@ public static void RealDataMbrTest() corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.That(corr > 0.7); + // Making MBR more permissive (i.e., minimum RT Window width) results in more MBR-detections but a lower corr + // corr should increase once we introduce target/decoy and score filtering + Assert.That(corr > 0.65); // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein // has to be observed in a condition for match-between-runs From 95f92812bd7a2322b28f687882a6da5758e399d5 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 18 Apr 2024 12:31:40 -0500 Subject: [PATCH 45/55] Fixed final test --- mzLib/TestFlashLFQ/TestFlashLFQ.cs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 772768b01..040498f96 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -577,8 +577,6 @@ public static void TestFlashLfqMatchBetweenRuns() Assert.That(peak.Intensity > 0); Assert.That(peak.Intensity == otherFilePeak.Intensity); - Assert.That(peak.RtHypothesis.HasValue); - Assert.That(peak.RtHypothesis, Is.EqualTo(1.03).Within(0.01)); List rtDiffs = new(); for (int i = 0; i < 5; i++) { @@ -593,8 +591,6 @@ public static void TestFlashLfqMatchBetweenRuns() results = interquartileEngine.Run(); peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); - Assert.That(peak.RtHypothesis.HasValue); - Assert.That(peak.RtHypothesis, Is.EqualTo(1.04).Within(0.01)); for (int i = 0; i < 5; i++) { if (i == 2) continue; // exclude the mbr peak from the calculation From e6294a9d442c3b8f20ddaea3fbeb1d5875850f16 Mon Sep 17 00:00:00 2001 From: Alex Date: Thu, 18 Apr 2024 14:04:51 -0500 Subject: [PATCH 46/55] minor --- mzLib/FlashLFQ/RtInfo.cs | 11 +++++++---- mzLib/Test/TestDeconvolution.cs | 10 +++++----- mzLib/TestFlashLFQ/TestFlashLFQ.cs | 4 +--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs index 613aedd43..0750e4588 100644 --- a/mzLib/FlashLFQ/RtInfo.cs +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -10,10 +10,13 @@ public class RtInfo { public double PredictedRt { get; } public double Width { get; } - // the Math.Max components ensure that the width of an RT Window is at least _minimumWindowWidth wide - private double _minimumWindowWidth = 0.5; - public double RtStartHypothesis => PredictedRt - Math.Max((Width / 2.0), _minimumWindowWidth/2); - public double RtEndHypothesis => PredictedRt + Math.Max((Width / 2.0), _minimumWindowWidth/2); + public double RtStartHypothesis => PredictedRt - (Width / 2.0); + public double RtEndHypothesis => PredictedRt + (Width / 2.0); + + // These will be introduced in a later PR. For now, we're sticking with the classic version + //private double _minimumWindowWidth = 0.5; + //public double RtStartHypothesis => PredictedRt - Math.Max((Width / 2.0), _minimumWindowWidth/2); // the Math.Max components ensure that the width of an RT Window is at least _minimumWindowWidth wide + //public double RtEndHypothesis => PredictedRt + Math.Max((Width / 2.0), _minimumWindowWidth/2); public RtInfo(double predictedRt, double width) { diff --git a/mzLib/Test/TestDeconvolution.cs b/mzLib/Test/TestDeconvolution.cs index 7fabc4751..f0dc76e0f 100644 --- a/mzLib/Test/TestDeconvolution.cs +++ b/mzLib/Test/TestDeconvolution.cs @@ -182,13 +182,13 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid Deconvoluter deconvoluter = new Deconvoluter(DeconvolutionType.ClassicDeconvolution, deconParameters); //check assigned correctly - //List lie2 = deconvoluter.Deconvolute(singlespec, singleRange).ToList(); - //List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); - //Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); + List lie2 = Deconvoluter.Deconvolute(singlespec, deconParameters, singleRange).ToList(); + List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); + Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); //check that if already assigned, skips assignment and just recalls same value - //List lie3 = deconvoluter.Deconvolute(singlespec, singleRange).ToList(); - //Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); + List lie3 = Deconvoluter.Deconvolute(singlespec, deconParameters, singleRange).ToList(); + Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); } #endregion diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 040498f96..8b04d218b 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -1358,9 +1358,7 @@ public static void RealDataMbrTest() corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - // Making MBR more permissive (i.e., minimum RT Window width) results in more MBR-detections but a lower corr - // corr should increase once we introduce target/decoy and score filtering - Assert.That(corr > 0.65); + Assert.That(corr > 0.7); // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein // has to be observed in a condition for match-between-runs From afae344a6f3b055b3578400aba510b82a5750281 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 23 Apr 2024 13:49:21 -0500 Subject: [PATCH 47/55] Increased number of anchor peptides --- mzLib/FlashLFQ/FlashLfqEngine.cs | 20 ++++++++------------ mzLib/FlashLFQ/MbrScorer.cs | 9 ++++----- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 069253d58..1a62cb7b3 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -40,6 +40,7 @@ public class FlashLfqEngine public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; public readonly bool RequireMsmsIdInCondition; + private int _numberOfAnchorPeptidesForMbr = 3; // the number of anchor peptides used for local alignment when predicting retention times of MBR acceptor peptides // settings for the Bayesian protein quantification engine public readonly bool BayesianProteinQuant; @@ -54,7 +55,6 @@ public class FlashLfqEngine // structures used in the FlashLFQ engine private List _spectraFileInfo; - private Stopwatch _globalStopwatch; private List _allIdentifications; /// @@ -537,9 +537,7 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec } } - // build rtDiff distribution - //var rtDifferenceDistribution = new Normal(mean: anchorPeptideRtDiffs.Median(), stddev: anchorPeptideRtDiffs.StandardDeviation()); - scorer.AddRtPredErrorDistribution(donor, anchorPeptideRtDiffs); + scorer.AddRtPredErrorDistribution(donor, anchorPeptideRtDiffs, _numberOfAnchorPeptidesForMbr); return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); } @@ -557,11 +555,9 @@ internal RtInfo PredictRetentionTime( ChromatographicPeak donorPeak, SpectraFileInfo acceptorFile, bool acceptorSampleIsFractionated, - bool donorSampleIsFractionated, - MbrScorer scorer) + bool donorSampleIsFractionated) { - var nearbyCalibrationPoints = new List(); - int numberOfAnchorsPerSide = 2; // The number of anchor peptides to be used for local alignment (on either side of the donor peptide) + var nearbyCalibrationPoints = new List(); // The number of anchor peptides to be used for local alignment (on either side of the donor peptide) // only compare +- 1 fraction if (acceptorSampleIsFractionated && donorSampleIsFractionated) @@ -602,7 +598,7 @@ internal RtInfo PredictRetentionTime( } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); numberOfForwardAnchors++; - if(numberOfForwardAnchors >= numberOfAnchorsPerSide) // We only want a handful of anchor points + if(numberOfForwardAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points { break; } @@ -622,7 +618,7 @@ internal RtInfo PredictRetentionTime( } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); numberOfBackwardsAnchors++; - if (numberOfBackwardsAnchors >= numberOfAnchorsPerSide) // We only want a handful of anchor points + if (numberOfBackwardsAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points { break; } @@ -640,7 +636,7 @@ internal RtInfo PredictRetentionTime( .ToList(); double medianRtDiff = rtDiffs.Median(); - double rtRange = rtDiffs.InterquartileRange() * 4.5; // This is roughly equivalent to 2 standard deviations + double rtRange = rtDiffs.StandardDeviation() * 6.0; // Search in the area 3 StdDevs on either side of the predicted RT rtRange = Math.Min(rtRange, MbrRtWindow); @@ -785,7 +781,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) { ChromatographicPeak donorPeak = idDonorPeaks[i]; // TODO: Add a toggle that set rtRange to be maximum width - RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated, scorer); + RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (rtInfo == null) continue; FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index c5c375fd0..8c2eea8af 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -56,7 +56,7 @@ internal MbrScorer( /// match-between-runs for the specified donor file /// /// List of retention time differences (doubles) calculated as donor file RT - acceptor file RT - internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs) + internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs, int numberOfAnchorPeptidesPerSide) { // in MBR, we use anchor peptides on either side of the donor to predict the retention time // here, we're going to repeat the same process, using neighboring anchor peptides to predicte the Rt shift for each @@ -64,19 +64,18 @@ internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List // then, we'll check how close our predicted rt shift was to the observed rt shift // and build a distribution based on the predicted v actual rt diffs - int numAnchorPepsPerSide = 2; // hardCoded for now, number of anchor peptides on each side of the "donor" to be considered double cumSumRtDiffs; List rtPredictionErrors = new(); - for (int i = numAnchorPepsPerSide; i < anchorPeptideRtDiffs.Count - (numAnchorPepsPerSide); i++) + for (int i = numberOfAnchorPeptidesPerSide; i < anchorPeptideRtDiffs.Count - (numberOfAnchorPeptidesPerSide) ; i++) { cumSumRtDiffs = 0; - for(int j = 1; j <= numAnchorPepsPerSide; j++) + for(int j = 1; j <= numberOfAnchorPeptidesPerSide; j++) { cumSumRtDiffs += anchorPeptideRtDiffs[i - j]; cumSumRtDiffs += anchorPeptideRtDiffs[i + j]; } - double avgDiff = cumSumRtDiffs / (2 * numAnchorPepsPerSide); + double avgDiff = cumSumRtDiffs / (2 * numberOfAnchorPeptidesPerSide); rtPredictionErrors.Add(avgDiff - anchorPeptideRtDiffs[i]); } From 6d44e98b8e58df0733f8ae909ef409becee8cf5d Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 23 Apr 2024 14:32:28 -0500 Subject: [PATCH 48/55] Changed anchors, updated test --- mzLib/FlashLFQ/FlashLfqEngine.cs | 4 +++- mzLib/TestFlashLFQ/TestFlashLFQ.cs | 3 ++- mzLib/mzLib.nuspec | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 1a62cb7b3..07428cf82 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -636,7 +636,9 @@ internal RtInfo PredictRetentionTime( .ToList(); double medianRtDiff = rtDiffs.Median(); - double rtRange = rtDiffs.StandardDeviation() * 6.0; // Search in the area 3 StdDevs on either side of the predicted RT + double rtRange = rtDiffs.InterquartileRange() * 4.5; + // IQR * 4.5 is roughly equivalent to 6 StdDevs, so search window extends ~3 std.devs from either side of predicted RT + // IQR is less affected by outliers than StdDev rtRange = Math.Min(rtRange, MbrRtWindow); diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 8b04d218b..934899098 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -1358,7 +1358,8 @@ public static void RealDataMbrTest() corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.That(corr > 0.7); + // Update means more MBR-detections, which decreases the correlation slightly. Will increase again when we begin filtering based on MBR score + Assert.Greater(corr, 0.69); // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein // has to be observed in a condition for match-between-runs diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 5d5400e95..b51a69470 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 1.0.547 + 5.103.0 mzLib Stef S. Stef S. From f08cba16c752bfcec812bcee9b44f23981f20c7a Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 23 Apr 2024 14:33:22 -0500 Subject: [PATCH 49/55] nuspec revert --- mzLib/mzLib.nuspec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index b51a69470..5d5400e95 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 5.103.0 + 1.0.547 mzLib Stef S. Stef S. From f8394f3676af0b59dd9279f22efea54d6dc250eb Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 24 Apr 2024 12:20:25 -0500 Subject: [PATCH 50/55] Addressed MRS comments, minor refactor so that scores are calculated in the ChromPeak class --- mzLib/FlashLFQ/ChromatographicPeak.cs | 124 ++++++++++++++++---------- mzLib/FlashLFQ/FlashLfqEngine.cs | 5 +- mzLib/FlashLFQ/MbrScorer.cs | 82 +++++++++-------- mzLib/TestFlashLFQ/TestFlashLFQ.cs | 1 - 4 files changed, 117 insertions(+), 95 deletions(-) diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 353909ed1..9041eab66 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -1,10 +1,9 @@ -using Chemistry; -using MathNet.Numerics.Statistics; +using MzLibUtil; using System; using System.Collections.Generic; -using System.Configuration; using System.Linq; using System.Text; +using ClassExtensions = Chemistry.ClassExtensions; namespace FlashLFQ { @@ -17,13 +16,20 @@ public class ChromatographicPeak public int ScanCount => IsotopicEnvelopes.Count; public double SplitRT; public readonly bool IsMbrPeak; - public double MbrScore; - public double PpmScore { get; set; } - public double IntensityScore { get; set; } - public double RtScore { get; set; } - public double ScanCountScore { get; set; } + public double PredictedRetentionTime { get; init; } - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) + /// + /// A score bounded by 100 and 0, with more confident MBR-detections receiving higher scores + /// + public double MbrScore { get; private set; } + + /// The four scores below are bounded by 0 and 1, with higher scores being better + public double PpmScore { get; private set; } + public double IntensityScore { get; private set; } + public double RtScore { get; private set; } + public double ScanCountScore { get; private set; } + + public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo) { SplitRT = 0; NumChargeStatesObserved = 0; @@ -36,52 +42,18 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi SpectraFileInfo = fileInfo; } + public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, double predictedRetentionTime) : + this(id, isMbrPeak, fileInfo) + { + PredictedRetentionTime = predictedRetentionTime; + } + public IsotopicEnvelope Apex { get; private set; } public List Identifications { get; private set; } public int NumChargeStatesObserved { get; private set; } public int NumIdentificationsByBaseSeq { get; private set; } public int NumIdentificationsByFullSeq { get; private set; } public double MassError { get; private set; } - /// - /// Expected retention time for MBR acceptor peaks (mean) - /// - public double? RtHypothesis { get; private set; } - - public static string TabSeparatedHeader - { - get - { - var sb = new StringBuilder(); - sb.Append("File Name" + "\t"); - sb.Append("Base Sequence" + "\t"); - sb.Append("Full Sequence" + "\t"); - sb.Append("Protein Group" + "\t"); - sb.Append("Organism" + '\t'); - sb.Append("Peptide Monoisotopic Mass" + "\t"); - sb.Append("MS2 Retention Time" + "\t"); - sb.Append("Precursor Charge" + "\t"); - sb.Append("Theoretical MZ" + "\t"); - sb.Append("Peak intensity" + "\t"); - sb.Append("Peak RT Start" + "\t"); - sb.Append("Peak RT Apex" + "\t"); - sb.Append("Peak RT End" + "\t"); - sb.Append("Peak MZ" + "\t"); - sb.Append("Peak Charge" + "\t"); - sb.Append("Num Charge States Observed" + "\t"); - sb.Append("Peak Detection Type" + "\t"); - sb.Append("MBR Score" + "\t"); - sb.Append("Ppm Score" + "\t"); - sb.Append("Intensity Score" + "\t"); - sb.Append("Rt Score" + "\t"); - sb.Append("Scan Count Score" + "\t"); - sb.Append("PSMs Mapped" + "\t"); - sb.Append("Base Sequences Mapped" + "\t"); - sb.Append("Full Sequences Mapped" + "\t"); - sb.Append("Peak Split Valley RT" + "\t"); - sb.Append("Peak Apex Mass Error (ppm)"); - return sb.ToString(); - } - } public void CalculateIntensityForThisFeature(bool integrate) { @@ -152,6 +124,60 @@ public void ResolveIdentifications() this.NumIdentificationsByFullSeq = Identifications.Select(v => v.ModifiedSequence).Distinct().Count(); } + /// + /// Calculates four component scores and one overarching Mbr score for an MBR peak. + /// MBR Score is equal to 100 * the geometric mean of the four component scores. + /// + /// An MbrScorer specific to the file where this peak was found + /// The donor peak used as the basis for the MBR identification. + internal void CalculateMbrScore(MbrScorer scorer, ChromatographicPeak donorPeak) + { + if (SpectraFileInfo != scorer.AcceptorFile) throw new MzLibException("Error when performing match-between-runs: Mismatch between scorer and peak."); + + IntensityScore = scorer.CalculateIntensityScore(this, donorPeak); + RtScore = scorer.CalculateRetentionTimeScore(this, donorPeak); + PpmScore = scorer.CalculatePpmErrorScore(this); + ScanCountScore = scorer.CalculateScanCountScore(this); + + MbrScore = 100 * Math.Pow(IntensityScore * RtScore * PpmScore * ScanCountScore, 0.25); + } + + public static string TabSeparatedHeader + { + get + { + var sb = new StringBuilder(); + sb.Append("File Name" + "\t"); + sb.Append("Base Sequence" + "\t"); + sb.Append("Full Sequence" + "\t"); + sb.Append("Protein Group" + "\t"); + sb.Append("Organism" + '\t'); + sb.Append("Peptide Monoisotopic Mass" + "\t"); + sb.Append("MS2 Retention Time" + "\t"); + sb.Append("Precursor Charge" + "\t"); + sb.Append("Theoretical MZ" + "\t"); + sb.Append("Peak intensity" + "\t"); + sb.Append("Peak RT Start" + "\t"); + sb.Append("Peak RT Apex" + "\t"); + sb.Append("Peak RT End" + "\t"); + sb.Append("Peak MZ" + "\t"); + sb.Append("Peak Charge" + "\t"); + sb.Append("Num Charge States Observed" + "\t"); + sb.Append("Peak Detection Type" + "\t"); + sb.Append("MBR Score" + "\t"); + sb.Append("Ppm Score" + "\t"); + sb.Append("Intensity Score" + "\t"); + sb.Append("Rt Score" + "\t"); + sb.Append("Scan Count Score" + "\t"); + sb.Append("PSMs Mapped" + "\t"); + sb.Append("Base Sequences Mapped" + "\t"); + sb.Append("Full Sequences Mapped" + "\t"); + sb.Append("Peak Split Valley RT" + "\t"); + sb.Append("Peak Apex Mass Error (ppm)"); + return sb.ToString(); + } + } + public override string ToString() { StringBuilder sb = new StringBuilder(); diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 07428cf82..09cce4604 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -806,7 +806,6 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) if (samePeakSameSequence != null) { - samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; samePeakSameSequence.Identifications.Add(acceptorPeak.Identifications.First()); } else @@ -991,7 +990,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( List chargeEnvelopes) { var donorId = donorPeak.Identifications.First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, predictedRetentionTime: rtInfo.PredictedRt); // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -1014,7 +1013,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, rtInfo.PredictedRt); + acceptorPeak.CalculateMbrScore(scorer, donorPeak); return acceptorPeak; } diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 8c2eea8af..a703cf3b7 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -3,7 +3,6 @@ using System; using System.Collections.Generic; using System.Data; -using System.Data.Entity.ModelConfiguration.Conventions; using System.Linq; namespace FlashLFQ @@ -14,6 +13,7 @@ namespace FlashLFQ /// internal class MbrScorer { + internal SpectraFileInfo AcceptorFile { get; init; } // Intensity and ppm distributions are specific to each acceptor file private readonly Normal _logIntensityDistribution; private readonly Normal _ppmDistribution; @@ -21,10 +21,8 @@ internal class MbrScorer // The logFcDistributions and rtDifference distributions are unique to each donor file - acceptor file pair private Dictionary _logFcDistributionDictionary; private Dictionary _rtPredictionErrorDistributionDictionary; - internal Dictionary ApexToAcceptorFilePeakDict { get; } - internal List UnambiguousMsMsAcceptorPeaks { get; } - internal double MaxNumberOfScansObserved { get; } + internal List UnambiguousMsMsPeaks { get; } /// /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution @@ -36,9 +34,9 @@ internal MbrScorer( Normal ppmDistribution, Normal logIntensityDistribution) { + AcceptorFile = acceptorPeaks.First().SpectraFileInfo; ApexToAcceptorFilePeakDict = apexToAcceptorFilePeakDict; - UnambiguousMsMsAcceptorPeaks = acceptorPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); - MaxNumberOfScansObserved = acceptorPeaks.Max(peak => peak.ScanCount); + UnambiguousMsMsPeaks = acceptorPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; _logFcDistributionDictionary = new(); @@ -47,6 +45,7 @@ internal MbrScorer( // This is kludgey, because scan counts are discrete List scanList = acceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); // build a normal distribution for the scan list of the acceptor peaks + // InterQuartileRange / 1.35 = StandardDeviation for a normal distribution _scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36); } @@ -91,48 +90,16 @@ internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List _rtPredictionErrorDistributionDictionary.Add(donorFile, new Normal(medianRtError, stdDevRtError)); } - /// - /// Get the RT window width for a given donor file, - /// where RT window width is equal to 4*stdDev of the rtDiffs for all anchor peptides - /// - /// The width of the retention time window in minutes - internal double GetRTWindowWidth(SpectraFileInfo donorFile) - { - // 95% of all peaks are expected to fall within six standard deviations - return _rtPredictionErrorDistributionDictionary[donorFile].StdDev * 4; - } - - internal double GetMedianRtDiff(SpectraFileInfo donorFile) - { - return _rtPredictionErrorDistributionDictionary[donorFile].Median; - } - - /// - /// Scores a MBR peak based on it's retention time, ppm error, and intensity - /// - /// An MBR Score ranging between 0 and 100. Higher scores are better. - internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak, double predictedRt) - { - acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); - acceptorPeak.RtScore = CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], - predictedRt - acceptorPeak.ApexRetentionTime); - acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); - acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); - - // Returns 100 times the geometric mean of the four scores - return 100 * Math.Pow( acceptorPeak.IntensityScore * acceptorPeak.RtScore * acceptorPeak.PpmScore * acceptorPeak.ScanCountScore, 0.25); - } - - internal double CalculateScore(Normal distribution, double value) + private double CalculateScore(Normal distribution, double value) { - // new method double absoluteDiffFromMean = Math.Abs(distribution.Mean - value); // Returns a value between (0, 1] where 1 means the value was equal to the distribution mean return 2 * distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); } - internal double CalculateIntensityScore(double acceptorIntensity, ChromatographicPeak donorPeak) + internal double CalculateIntensityScore(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) { + double acceptorIntensity = acceptorPeak.Intensity; if (donorPeak != null && acceptorIntensity != 0 && donorPeak.Intensity != 0 && _logFcDistributionDictionary.TryGetValue(donorPeak.SpectraFileInfo, out var logFcDistribution)) { @@ -144,7 +111,38 @@ internal double CalculateIntensityScore(double acceptorIntensity, Chromatographi var logIntensity = Math.Log(acceptorIntensity, 2); return CalculateScore(_logIntensityDistribution, logIntensity); } + } + + /// + /// Calculates the retention time score for a given MbrAcceptor by comparing to the + /// distribution of all retention time prediction errors for all anchor peptides shared between + /// the donor and acceptor files + /// + /// Score bounded by 0 and 1, where higher scores are better + internal double CalculateRetentionTimeScore(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) + { + double rtPredictionError = acceptorPeak.PredictedRetentionTime - acceptorPeak.ApexRetentionTime; + return CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], rtPredictionError); + } + + /// + /// Calculates the Ppm error score for a given acceptor by comparing the ppm error for the given peak + /// to the ppm error of all non-MBR peaks in the acceptor file + /// + /// Score bounded by 0 and 1, where higher scores are better + internal double CalculatePpmErrorScore(ChromatographicPeak acceptorPeak) + { + return CalculateScore(_ppmDistribution, acceptorPeak.MassError); + } + /// + /// Calculates the scan count score for a given acceptor by comparing the number of scans observed for the given peak + /// to the ppm error of all non-MBR peaks in the acceptor file + /// + /// Score bounded by 0 and 1, where higher scores are better + internal double CalculateScanCountScore(ChromatographicPeak acceptorPeak) + { + return CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); } /// @@ -164,7 +162,7 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP var acceptorFileBestMsmsPeaks = new Dictionary(); // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in UnambiguousMsMsAcceptorPeaks) + foreach (ChromatographicPeak acceptorPeak in UnambiguousMsMsPeaks) { if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) { diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 934899098..f2b550651 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -586,7 +586,6 @@ public static void TestFlashLfqMatchBetweenRuns() Assert.That(results.Peaks[file1].Count == 5); Assert.That(!results.Peaks[file1].Any(p => p.IsMbrPeak)); - Assert.That(!results.Peaks[file1].Any(p => p.RtHypothesis.HasValue)); results = interquartileEngine.Run(); peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); From 2c00ebcda380d2744d8d8362108427542920fda2 Mon Sep 17 00:00:00 2001 From: Alex Date: Wed, 24 Apr 2024 12:21:37 -0500 Subject: [PATCH 51/55] minor --- mzLib/Test/Test.csproj | 1 - 1 file changed, 1 deletion(-) diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 89911cc3e..7b5882009 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -13,7 +13,6 @@ - From 4bcdc0dea400ae9d6446423b0bf68212f065d8b7 Mon Sep 17 00:00:00 2001 From: RayMSMS Date: Wed, 12 Jun 2024 12:38:43 -0500 Subject: [PATCH 52/55] first commit --- .gitignore | 1 + mzLib/FlashLFQ/Alex project/XIC.cs | 129 +++++++++++++++++++++++++++++ 2 files changed, 130 insertions(+) create mode 100644 mzLib/FlashLFQ/Alex project/XIC.cs diff --git a/.gitignore b/.gitignore index b33850435..cf0c9f511 100644 --- a/.gitignore +++ b/.gitignore @@ -250,3 +250,4 @@ ModelManifest.xml # Macintosh files **/.DS_Store +/mzLib/TestFlashLFQ/XicTest.cs diff --git a/mzLib/FlashLFQ/Alex project/XIC.cs b/mzLib/FlashLFQ/Alex project/XIC.cs new file mode 100644 index 000000000..58b7619aa --- /dev/null +++ b/mzLib/FlashLFQ/Alex project/XIC.cs @@ -0,0 +1,129 @@ +using MathNet.Numerics.Interpolation; +using MathNet.Numerics.Providers.LinearAlgebra; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using MathNet.Numerics.IntegralTransforms; +using System.Runtime.CompilerServices; +using System.Numerics; +using Omics.Fragmentation; + +namespace FlashLFQ.Alex_project +{ + public class XIC + { + public List Ms1Peaks { get; init; } + public double PeakFindingMz { get; init; } + public SpectraFileInfo SpectraFile { get; init; } + public bool Reference { get; init; } + public MathNet.Numerics.Interpolation.LinearSpline LinearSpline { get; private set;} + public double RtShift { get; private set; } + + public XIC(List peaks, double peakFindingMass, SpectraFileInfo spectraFile) + { + Ms1Peaks = peaks; + PeakFindingMz = peakFindingMass; + SpectraFile = spectraFile; + } + + private List PadPeaks() + + { + var paddedPeaks = new List(); + var firstPeak = Ms1Peaks[0]; + var lastPeak = Ms1Peaks[Ms1Peaks.Count - 1]; + double gap = (lastPeak.RetentionTime - firstPeak.RetentionTime) / (Ms1Peaks.Count-1); + + // because we hope to have an odd number of peaks, we have to add the odd number padded peaks + + + + for (int i = 5; i > 0; i--) //add 4 peaks before the first peak + { + paddedPeaks.Add(new IndexedMassSpectralPeak(0, 0, 0, firstPeak.RetentionTime - gap * i)); // not sure about the m/z and index + } + + for (int i = 0; i < Ms1Peaks.Count; i++) + { + paddedPeaks.Add(Ms1Peaks[i]); + } + + for (int i = 1; i < 6; i++) //add 5 peaks after the last peak + { + paddedPeaks.Add(new IndexedMassSpectralPeak(0, 0, 0, lastPeak.RetentionTime + gap * i)); + } + + + + return paddedPeaks; + + } + + internal void BulidLinearSpline() + { + double[] x = PadPeaks().Select(p => p.RetentionTime).ToArray(); + double[] y = PadPeaks().Select(p => p.Intensity).ToArray(); + this.LinearSpline = LinearSpline.InterpolateSorted(x, y); // I am not sure what to put in the last parameter + } + + + /// + /// Aligns two XICs and reports the relative shift in the time using Fast Fourier Transform. + /// This function performs better if the XIC contains signal from before and after the last chromatographic peak of the interest (i.em longer XICs are better). + /// Alignment will fail if the magnitude of the RT shift is greater then 1/4 the RT span of either XIC. + /// The XICs are up-sampled to allow for sub-pixel resolution. (one Peaks datapoint = one pixel). + /// @return rtShift: The times shift that needed to move to align to the referenceXICs. Positive values indicate the mins to move forward, negative indicate the mins to move backward. + /// + + public static double AlignXICs(XIC referenceXic, XIC xicToAlign, int resolution = 100) + { + + + referenceXic.BulidLinearSpline(); + xicToAlign.BulidLinearSpline(); + var referSpline = referenceXic.LinearSpline; + var toAlignSpline = xicToAlign.LinearSpline; + + double timegap = (referenceXic.Ms1Peaks.Last().RetentionTime - referenceXic.Ms1Peaks[0].RetentionTime) / (referenceXic.Ms1Peaks.Count - 1); + double initialTime = referenceXic.Ms1Peaks[0].RetentionTime-5.0*timegap; //after the padding, the first peak move ahead 5 timegap + double FinalTime = referenceXic.Ms1Peaks.Last().RetentionTime+5.0*timegap; //after the padding, the last peak move back 5 timegap + double time = initialTime; + + // create two arrays to store the interpolated values of the two XICs + Complex[] reference = new Complex[(int)((FinalTime - initialTime)* resolution + 2)]; + Complex[] toAlign = new Complex[(int)((FinalTime - initialTime) * resolution + 2)]; + int index = 0; + while (time < FinalTime) + { + reference[index] = new Complex(referSpline.Interpolate(time), 0); + toAlign[index] = new Complex(toAlignSpline.Interpolate(time), 0); + time += (1.0 / resolution); + index++; + } + + Fourier.Forward(reference); + Fourier.Forward(toAlign); + Complex[] product = new Complex[reference.Length]; + for (int i = 0; i < reference.Length; i++) + { + product[i] = reference[i] * Complex.Conjugate(toAlign[i]); //element-wise multiplication + } + Fourier.Inverse(product); + + for (int i = 0; i < product.Length / 2 ; i++) //swap the first half and the second half of the array, ex the timeLine(0,2pi) -> (-pi,pi) + { + Complex temp = product[i]; + product[i]= product[product.Length / 2+i]; + product[product.Length / 2+i] = temp; + } + + double maxMagnitude = product.Max(p => p.Magnitude); + int indexForTheMaxValue = Array.FindIndex(product, p => p.Magnitude == maxMagnitude); + double rtShift = -(product.Length/2 - indexForTheMaxValue) * (1.0 / resolution); + return rtShift; + } + + } +} From 618f8c4f6e6f125228c5469e4d6dff565a24030b Mon Sep 17 00:00:00 2001 From: RayMSMS Date: Wed, 12 Jun 2024 12:40:11 -0500 Subject: [PATCH 53/55] example commit --- mzLib/FlashLFQ/Alex project/XIC.cs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mzLib/FlashLFQ/Alex project/XIC.cs b/mzLib/FlashLFQ/Alex project/XIC.cs index 58b7619aa..21076f8e7 100644 --- a/mzLib/FlashLFQ/Alex project/XIC.cs +++ b/mzLib/FlashLFQ/Alex project/XIC.cs @@ -123,6 +123,8 @@ public static double AlignXICs(XIC referenceXic, XIC xicToAlign, int resolution int indexForTheMaxValue = Array.FindIndex(product, p => p.Magnitude == maxMagnitude); double rtShift = -(product.Length/2 - indexForTheMaxValue) * (1.0 / resolution); return rtShift; + + // Example } } From 1da1161e12e0be6e27d5a3f673d46d8d80ef35d4 Mon Sep 17 00:00:00 2001 From: RayMSMS Date: Mon, 15 Jul 2024 14:44:33 -0500 Subject: [PATCH 54/55] update 7/15/2024 1. Create the new class "Extremum" to store the local extre number 2. More XIC method, BuildSmoothedCubicSpline and "FindExtrema" --- mzLib/FlashLFQ/Alex project/Extremum.cs | 76 +++++++++++++++++++++++++ mzLib/FlashLFQ/Alex project/XIC.cs | 73 ++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 mzLib/FlashLFQ/Alex project/Extremum.cs diff --git a/mzLib/FlashLFQ/Alex project/Extremum.cs b/mzLib/FlashLFQ/Alex project/Extremum.cs new file mode 100644 index 000000000..3c2551b95 --- /dev/null +++ b/mzLib/FlashLFQ/Alex project/Extremum.cs @@ -0,0 +1,76 @@ +using MzLibUtil; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ.Alex_project +{ + + public enum ExtremumType { Minimum, Maximum }; //The spectra file the XIC came from + + internal class Extremum : IComparable, IEquatable + { + public readonly double Intensity; //The aligned intensity of the Extremum point + public readonly double RetentionTime; //The interpolated intensity of the Extremum point + public readonly ExtremumType Type; //The type of the Extremum point + + public Extremum(double intensity, double retentionTime, ExtremumType type) + { + Intensity = intensity; + RetentionTime = retentionTime; + Type = type; + } + + + + + public int CompareTo(Extremum others) + { + if (others == null) + { + return 1; + } + + if (this - others > 0) + { + return 1; + } + + else if (this - others < 0) + { + return -1; + } + + else + { + return 0; + } + + } + + public bool Equals(Extremum others) + { + if (this.Intensity == others.Intensity && Math.Abs(this - others) < 0.006 && this.Type == others.Type) + { + return true; + } + + return false; + } + + public override bool Equals(Object obj) + { + return Equals(this, (Extremum)obj); + } + + public static double operator - (Extremum extremun1, Extremum extrenum2) + { + double rtDiff = extremun1.RetentionTime - extrenum2.RetentionTime; + return rtDiff; + } + + + } +} diff --git a/mzLib/FlashLFQ/Alex project/XIC.cs b/mzLib/FlashLFQ/Alex project/XIC.cs index 21076f8e7..a8ed8ee68 100644 --- a/mzLib/FlashLFQ/Alex project/XIC.cs +++ b/mzLib/FlashLFQ/Alex project/XIC.cs @@ -9,6 +9,9 @@ using System.Runtime.CompilerServices; using System.Numerics; using Omics.Fragmentation; +using Readers.Generated; +using System.Threading; +using Easy.Common.Extensions; namespace FlashLFQ.Alex_project { @@ -19,7 +22,9 @@ public class XIC public SpectraFileInfo SpectraFile { get; init; } public bool Reference { get; init; } public MathNet.Numerics.Interpolation.LinearSpline LinearSpline { get; private set;} + public CubicSpline SmoothedCubicSpline { get; private set; } public double RtShift { get; private set; } + List Extrema { get; set; } public XIC(List peaks, double peakFindingMass, SpectraFileInfo spectraFile) { @@ -127,5 +132,73 @@ public static double AlignXICs(XIC referenceXic, XIC xicToAlign, int resolution // Example } + /// + /// Try to smooth the XIC by averaging the intensity of the points + /// + /// should be odds number. The number of points to average for smoothing the XIC + /// + public void BuildSmoothedCubicSpline(int pointsToAverage = 3) + { + if (pointsToAverage <= 0) + { + throw new ArgumentException("pointsToAverage must be greater than 0"); + } + double[] intensity = Ms1Peaks.Select(p => p.Intensity).ToArray(); + double[] retentionTime = Ms1Peaks.Select(p => p.RetentionTime + RtShift).ToArray(); + double[] smoothedIntensity = new double[intensity.Length]; + + for (int i = 0; i < intensity.Length; i++) //smooth the intensity + { + if (i < pointsToAverage / 2) + { + smoothedIntensity[i] = 0; + } + + else if (i >= intensity.Length - pointsToAverage / 2) + { + smoothedIntensity[i] = 0; + } + + else + { + double sum = 0; + for (int j = 0; j < pointsToAverage; j++) + { + sum += intensity[i + j - pointsToAverage / 2]; + } + smoothedIntensity[i] = sum / pointsToAverage; + } + + } + + this.SmoothedCubicSpline = CubicSpline.InterpolateAkima(retentionTime, smoothedIntensity); + } + + + /// + /// Use the second derivative to find the local maxmun and minimum points + /// + public void FindExtrema() + { + Extrema = new List(); + double[] extrePoints = SmoothedCubicSpline.StationaryPoints(); //extrePoint is the retentionTime for the point's first derivative is zero + foreach (var point in extrePoints) + { + if (SmoothedCubicSpline.Differentiate2(point) > 0) //Local Maxmun point + { + Extrema.Add(new Extremum(Ms1Peaks.Where(p => p.RetentionTime == point).First().Intensity, point, ExtremumType.Maximum)); + Extrema.Sort(); + } + if (SmoothedCubicSpline.Differentiate2(point) > 0) //Local Minmun point + { + Extrema.Add(new Extremum(Ms1Peaks.Where(p => p.RetentionTime == point).First().Intensity, point, ExtremumType.Minimum)); + Extrema.Sort(); + } + + } + + + } + } } From aa37dda72b62d91a725d1c87e477e1615774c76d Mon Sep 17 00:00:00 2001 From: RayMSMS <150720362+RayMSMS@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:50:29 -0500 Subject: [PATCH 55/55] update the flashLFQ 1. Creat a new situation "IsobaricCase" in MBR quantity 2. Create the private method "BuildXIC", "CollectPeakInDifferent", "BuildChrom" 3. new property "isobaricCase", "isobaricDict" --- mzLib/FlashLFQ/Alex project/Extremum.cs | 4 +- mzLib/FlashLFQ/Alex project/ParallelSearch.cs | 136 +++++++++ mzLib/FlashLFQ/Alex project/XIC.cs | 56 ++-- mzLib/FlashLFQ/Alex project/XICGroups.cs | 232 +++++++++++++++ mzLib/FlashLFQ/FlashLFQ.csproj | 2 + mzLib/FlashLFQ/FlashLfqEngine.cs | 231 ++++++++++++++- mzLib/FlashLFQ/Parameter.cs | 19 ++ mzLib/FlashLFQ/PeakIndexingEngine.cs | 12 +- mzLib/Test/FileReadingTests/TestMzML.cs | 92 ++++++ .../FileReadingTests/TestRawFileReader.cs | 263 ++++++++++++++++++ mzLib/TestFlashLFQ/TestFlashLFQ.cs | 2 +- mzLib/TestFlashLFQ/TestFlashLFQ.csproj | 3 + mzLib/TestFlashLFQ/ThreadingTest.cs | 115 ++++++++ 13 files changed, 1134 insertions(+), 33 deletions(-) create mode 100644 mzLib/FlashLFQ/Alex project/ParallelSearch.cs create mode 100644 mzLib/FlashLFQ/Alex project/XICGroups.cs create mode 100644 mzLib/FlashLFQ/Parameter.cs create mode 100644 mzLib/TestFlashLFQ/ThreadingTest.cs diff --git a/mzLib/FlashLFQ/Alex project/Extremum.cs b/mzLib/FlashLFQ/Alex project/Extremum.cs index 3c2551b95..5ce71803e 100644 --- a/mzLib/FlashLFQ/Alex project/Extremum.cs +++ b/mzLib/FlashLFQ/Alex project/Extremum.cs @@ -10,7 +10,7 @@ namespace FlashLFQ.Alex_project public enum ExtremumType { Minimum, Maximum }; //The spectra file the XIC came from - internal class Extremum : IComparable, IEquatable + public class Extremum : IComparable, IEquatable { public readonly double Intensity; //The aligned intensity of the Extremum point public readonly double RetentionTime; //The interpolated intensity of the Extremum point @@ -71,6 +71,8 @@ public override bool Equals(Object obj) return rtDiff; } + + } } diff --git a/mzLib/FlashLFQ/Alex project/ParallelSearch.cs b/mzLib/FlashLFQ/Alex project/ParallelSearch.cs new file mode 100644 index 000000000..266c56034 --- /dev/null +++ b/mzLib/FlashLFQ/Alex project/ParallelSearch.cs @@ -0,0 +1,136 @@ +using Easy.Common.Extensions; +using SharpLearning.InputOutput.Csv; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Runtime.ExceptionServices; +using System.Runtime.InteropServices; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using Plotly.NET; +using Plotly.NET.CSharp; +using Chart = Plotly.NET.CSharp.Chart; +using GenericChartExtensions = Plotly.NET.CSharp.GenericChartExtensions; +using System; +using System.Collections.Generic; +using System.Linq; +using Plotly.NET.LayoutObjects; +using Plotly.NET.TraceObjects; +using System.ComponentModel; +using System.Drawing; +using Color = Plotly.NET.Color; + + + +namespace FlashLFQ.Alex_project +{ + public class ParallelSearch + { + + readonly int threadNumber = Environment.ProcessorCount-1; + readonly int[] threads; + Dictionary xicDict; + XICGroups[] xicGroups; + + public ParallelSearch(Dictionary xicDict) + { + threads = Enumerable.Range(0, threadNumber).ToArray(); + this.xicDict = xicDict; + xicGroups = new XICGroups[threadNumber]; + } + + public void run() + { + Parallel.ForEach(threads, (currentThread) => + { + List xics = GroupedXIC(xicDict, currentThread); + if (xics.Count() != 0) + { + xicGroups[currentThread] = new XICGroups(xics, 0.5, 0.1); + draw(currentThread, xicGroups[currentThread]); + } + + }); + + Console.WriteLine("The total sum is "); + + } + + /// + /// Try to group the XICs from a Big XIC data set. If we met the reference XIC, we will build the XICGroups. + /// + /// + public static List GroupedXIC(Dictionary xicDict, int thread) + { + Dictionary xicsToGroup = new Dictionary(); + + lock (xicDict) + { + foreach (var xic in xicDict) + { + if (xic.Value.Reference == true && xicsToGroup.Where(p => p.Value.Reference).Count() > 0) + { + break; + } + + else + { + xicsToGroup.Add(xic.Key, xic.Value); + xicDict.Remove(xic.Key); + } + } + } + + return xicsToGroup.Select(p => p.Value).ToList(); + + } + + public static void draw(int currentThread, XICGroups xicGroup) + { + XIC referenceXIC = xicGroup.reference; + + var plotStack = Chart.Combine( + xicGroup.XICs.Select(xic => + Chart.Scatter( + xic.Ms1Peaks.Select(p => p.RetentionTime).ToArray(), + xic.Ms1Peaks.Select(p => p.Intensity).ToArray(), + StyleParam.Mode.Lines_Markers, + Name: "No" + currentThread + ) + ).ToArray() + + ) + .WithTitle("XIC_differentTime") + .WithXAxisStyle(Title: Title.init("Times")) + .WithYAxisStyle(Title: Title.init("Intensities")) + .WithSize(1200, 400); + + + var plot_extrema = Chart.Combine(new[] + { + Chart.Scatter(referenceXIC.Ms1Peaks.Select(P=>P.RetentionTime), referenceXIC.Ms1Peaks.Select(P=>P.Intensity), + StyleParam.Mode.Lines_Markers, MarkerSymbol: StyleParam.MarkerSymbol.Circle, Name: "refernece"), + Chart.Scatter(xicGroup.ExtremaInRef.Select(p=> + p.Key), xicGroup.ExtremaInRef.Select(p=> p.Value), + StyleParam.Mode.Markers, MarkerSymbol: StyleParam.MarkerSymbol.Star, Name: "shared Extrema").WithMarkerStyle(Size: 15), + }) + + .WithTitle("XIC_sharedExtrema") + .WithXAxisStyle(Title: Title.init("Times")) + .WithYAxisStyle(Title: Title.init("Intensities")) + .WithSize(1200, 400); + + + var stack = Chart.Grid(new[] { plotStack, plot_extrema }, 2, 1) + .WithSize(1200, 800); + + GenericChartExtensions.Show(stack); + + } + + } +} diff --git a/mzLib/FlashLFQ/Alex project/XIC.cs b/mzLib/FlashLFQ/Alex project/XIC.cs index a8ed8ee68..0540d9a8c 100644 --- a/mzLib/FlashLFQ/Alex project/XIC.cs +++ b/mzLib/FlashLFQ/Alex project/XIC.cs @@ -12,25 +12,30 @@ using Readers.Generated; using System.Threading; using Easy.Common.Extensions; +using System.Timers; +using CsvHelper.Configuration.Attributes; +using Chemistry; namespace FlashLFQ.Alex_project { public class XIC { - public List Ms1Peaks { get; init; } + public List Ms1Peaks { get; init; } public double PeakFindingMz { get; init; } public SpectraFileInfo SpectraFile { get; init; } public bool Reference { get; init; } public MathNet.Numerics.Interpolation.LinearSpline LinearSpline { get; private set;} public CubicSpline SmoothedCubicSpline { get; private set; } public double RtShift { get; private set; } - List Extrema { get; set; } + public List Extrema { get; set; } - public XIC(List peaks, double peakFindingMass, SpectraFileInfo spectraFile) + public XIC(List peaks, double peakFindingMass, SpectraFileInfo spectraFile, bool Isreference = false) { Ms1Peaks = peaks; PeakFindingMz = peakFindingMass; SpectraFile = spectraFile; + Reference = Isreference; + BulidLinearSpline(); } private List PadPeaks() @@ -74,26 +79,20 @@ internal void BulidLinearSpline() } - /// - /// Aligns two XICs and reports the relative shift in the time using Fast Fourier Transform. - /// This function performs better if the XIC contains signal from before and after the last chromatographic peak of the interest (i.em longer XICs are better). - /// Alignment will fail if the magnitude of the RT shift is greater then 1/4 the RT span of either XIC. - /// The XICs are up-sampled to allow for sub-pixel resolution. (one Peaks datapoint = one pixel). - /// @return rtShift: The times shift that needed to move to align to the referenceXICs. Positive values indicate the mins to move forward, negative indicate the mins to move backward. + /// + /// calculate the retention time shift between two XICs. Then store the value in the RtShift property /// - - public static double AlignXICs(XIC referenceXic, XIC xicToAlign, int resolution = 100) + /// The reference XIC + /// The number of the timepoint + /// The retention to shift to align to the reference + public double AlignXICs(XIC referenceXIC, int resolution = 100) { + var referSpline = referenceXIC.LinearSpline; + var toAlignSpline = this.LinearSpline; - - referenceXic.BulidLinearSpline(); - xicToAlign.BulidLinearSpline(); - var referSpline = referenceXic.LinearSpline; - var toAlignSpline = xicToAlign.LinearSpline; - - double timegap = (referenceXic.Ms1Peaks.Last().RetentionTime - referenceXic.Ms1Peaks[0].RetentionTime) / (referenceXic.Ms1Peaks.Count - 1); - double initialTime = referenceXic.Ms1Peaks[0].RetentionTime-5.0*timegap; //after the padding, the first peak move ahead 5 timegap - double FinalTime = referenceXic.Ms1Peaks.Last().RetentionTime+5.0*timegap; //after the padding, the last peak move back 5 timegap + double timegap = (this.Ms1Peaks.Last().RetentionTime - this.Ms1Peaks[0].RetentionTime) / (this.Ms1Peaks.Count - 1); + double initialTime = this.Ms1Peaks[0].RetentionTime-5.0*timegap; //after the padding, the first peak move ahead 5 timegap + double FinalTime = this.Ms1Peaks.Last().RetentionTime+5.0*timegap; //after the padding, the last peak move back 5 timegap double time = initialTime; // create two arrays to store the interpolated values of the two XICs @@ -127,6 +126,7 @@ public static double AlignXICs(XIC referenceXic, XIC xicToAlign, int resolution double maxMagnitude = product.Max(p => p.Magnitude); int indexForTheMaxValue = Array.FindIndex(product, p => p.Magnitude == maxMagnitude); double rtShift = -(product.Length/2 - indexForTheMaxValue) * (1.0 / resolution); + RtShift = rtShift; return rtShift; // Example @@ -143,6 +143,7 @@ public void BuildSmoothedCubicSpline(int pointsToAverage = 3) { throw new ArgumentException("pointsToAverage must be greater than 0"); } + double[] intensity = Ms1Peaks.Select(p => p.Intensity).ToArray(); double[] retentionTime = Ms1Peaks.Select(p => p.RetentionTime + RtShift).ToArray(); double[] smoothedIntensity = new double[intensity.Length]; @@ -171,7 +172,7 @@ public void BuildSmoothedCubicSpline(int pointsToAverage = 3) } - this.SmoothedCubicSpline = CubicSpline.InterpolateAkima(retentionTime, smoothedIntensity); + this.SmoothedCubicSpline = CubicSpline.InterpolateAkimaSorted(retentionTime, smoothedIntensity); } @@ -184,21 +185,22 @@ public void FindExtrema() double[] extrePoints = SmoothedCubicSpline.StationaryPoints(); //extrePoint is the retentionTime for the point's first derivative is zero foreach (var point in extrePoints) { - if (SmoothedCubicSpline.Differentiate2(point) > 0) //Local Maxmun point + if (SmoothedCubicSpline.Differentiate2(point) < 0) //Local Maximun point { - Extrema.Add(new Extremum(Ms1Peaks.Where(p => p.RetentionTime == point).First().Intensity, point, ExtremumType.Maximum)); - Extrema.Sort(); + var intensity_atPoint = LinearSpline.Interpolate(point - RtShift); + Extrema.Add(new Extremum(intensity_atPoint, point, ExtremumType.Maximum)); } if (SmoothedCubicSpline.Differentiate2(point) > 0) //Local Minmun point { - Extrema.Add(new Extremum(Ms1Peaks.Where(p => p.RetentionTime == point).First().Intensity, point, ExtremumType.Minimum)); - Extrema.Sort(); + var intensity_atPoint = LinearSpline.Interpolate(point - RtShift); + Extrema.Add(new Extremum(intensity_atPoint, point, ExtremumType.Minimum)); } - } + Extrema.Sort(); } + } } diff --git a/mzLib/FlashLFQ/Alex project/XICGroups.cs b/mzLib/FlashLFQ/Alex project/XICGroups.cs new file mode 100644 index 000000000..3a0989a5d --- /dev/null +++ b/mzLib/FlashLFQ/Alex project/XICGroups.cs @@ -0,0 +1,232 @@ +using Easy.Common.Extensions; +using MzLibUtil; +using OpenMcdf; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; + +namespace FlashLFQ.Alex_project +{ + public class XICGroups : IEnumerable + { + public XIC reference; + public List XICs; + public Dictionary RTDict; + public List sharedExtrema; + public Dictionary ExtremaInRef; // the shared extrema in the reference XIC (time/intensity) + readonly List ids; + public readonly Dictionary> indexedPeaks; + + /// + /// Build the XIC groups with the reference XIC + /// + /// The xICs list + /// The minmun number to find the shared extrema + /// The tolerance window to pick up shared exterma + public XICGroups(List xICs, double extremaThreshold = 0.5, double tolerance = 0.05) + { + XICs = xICs; + //this.ids = ids; + reference = XICs.Where(p => p.Reference == true).First(); // set up the XIC reference + RTDict = new Dictionary(); // build a dictionary to store the retention time shift of each XIC + + int xicID = 0; + foreach (var xic in XICs) + { + RTDict.Add(xicID, xic.AlignXICs(reference)); + xic.BuildSmoothedCubicSpline(); + xic.FindExtrema(); + xicID++; + } + + sharedExtrema = new List(); + buildSharedExtrema_2(extremaThreshold, tolerance); // find the sharedExtrema + indexedPeaks = buildIndexedPeaks(); // build the indexed peaks + sharedExtremaInRef(reference, sharedExtrema); + } + + /// + /// Build the XICGroups, default extremaCutoff is 50% , default tolerance is 0.05 min + /// + /// + public XICGroups(List xICs) + { + XICs = xICs; + reference = XICs.Where(p => p.Reference == true).First(); + RTDict = new Dictionary(); + + int xicID = 0; + foreach (var xic in XICs) + { + RTDict.Add(xicID, xic.AlignXICs(reference)); + xic.BuildSmoothedCubicSpline(); + xic.FindExtrema(); + xicID++; + } + + sharedExtrema = new List(); + buildSharedExtrema_2(0.5, 0.05); //default value, at least 50% of the XICs share the extrema + indexedPeaks = buildIndexedPeaks(); // build the indexed peaks + sharedExtremaInRef(reference, sharedExtrema); + } + + + + public void buildSharedExtrema(double count_threshold, double tolerance) + { + + foreach (var ref_extremum in reference.Extrema) + { + + int extremum_account = 0; + + foreach (var xic in XICs) + { + foreach (var xic_extremum in xic.Extrema) + { + if (within(xic_extremum, ref_extremum, tolerance) && xic_extremum.Type == ref_extremum.Type) + { + extremum_account++; + break; + } + } + + } + + if (extremum_account >= count_threshold * XICs.Count()) // if the extremum is shared by more than 50% of the XICs, then we accept + { + sharedExtrema.Add(ref_extremum); + } + + } + + } + + /// + /// Check the extremum is within the local maxmun window + /// + /// The extrema that want to compare + /// The local maximum in the reference + /// True: the extremun is within the window, False: the extremun is not within the window + private bool within(Extremum item, Extremum local_Max, double tolerance = 0.05) + {; + + double leftWindow = (local_Max.RetentionTime - tolerance >= 0) ? local_Max.RetentionTime - tolerance : 0; + + + double rightWindow = (local_Max.RetentionTime + tolerance <= reference.Ms1Peaks.Last().RetentionTime) + ? local_Max.RetentionTime + tolerance : reference.Ms1Peaks.Last().RetentionTime; + + + if (item.RetentionTime >= leftWindow && item.RetentionTime <= rightWindow && item.Type == local_Max.Type) + { + return true; + } + + return false; + } + + + public void buildSharedExtrema_2(double count_threshold = 0.5, double tolerance = 0.1) + { + List extremaList = new List(); + foreach (var xic in XICs) + { + extremaList.AddRange(xic.Extrema); + } + extremaList.Sort((p1,p2) => p1.RetentionTime.CompareTo(p2.RetentionTime)); + + + int index = 0; + Dictionary> group = new Dictionary>(); + + while (index < extremaList.Count()-1) + { + var currentExtremum = extremaList[index]; + List currentGroup = new List(); + currentGroup.Add(currentExtremum); + + for (int i = index+1; i < extremaList.Count() ; i++) + { + double timeDiff = extremaList[i].RetentionTime - currentExtremum.RetentionTime; + index = i; + + if (timeDiff > tolerance) + { + break; + } + + else + { + if (extremaList[i].Type == currentExtremum.Type) + { + currentGroup.Add(extremaList[i]); + } + } + } + + if (group.ContainsKey(currentExtremum)) + { + continue; + } + + group.Add(currentExtremum, currentGroup); + + } + + sharedExtrema = group.Where(p => p.Value.Count() >= count_threshold * XICs.Count()) + .Select(p=>p.Key).ToList(); + + } + + public void sharedExtremaInRef(XIC reference, List sharedExtre) + { + ExtremaInRef = new Dictionary(); + foreach (var extre in sharedExtre) + { + double extreTime = extre.RetentionTime; + double extreIntensity = reference.SmoothedCubicSpline.Interpolate(extreTime); + ExtremaInRef.Add(extreTime, extreIntensity); + } + } + + public Dictionary> buildIndexedPeaks() + { + var indexedPeaks = new Dictionary>(); + + foreach (var extremPoint in sharedExtrema.Where(p=>p.Type == ExtremumType.Maximum)) + { + int index_Peak = sharedExtrema.IndexOf(extremPoint); + int index_left = sharedExtrema + .Where(p => p.Type == ExtremumType.Minimum && p.RetentionTime < extremPoint.RetentionTime) + .Select(p => sharedExtrema.IndexOf(p)) + .LastOrDefault(); + + int index_right = sharedExtrema + .Where(p => p.Type == ExtremumType.Minimum && p.RetentionTime > extremPoint.RetentionTime) + .Select(p => sharedExtrema.IndexOf(p)) + .FirstOrDefault(); + + if(index_right == 0) index_right = sharedExtrema.Count() - 1; + + indexedPeaks.Add(extremPoint.RetentionTime, new Tuple (sharedExtrema[index_left].RetentionTime, sharedExtrema[index_right].RetentionTime)); + } + return indexedPeaks; + } + + + public IEnumerator GetEnumerator() + { + throw new NotImplementedException(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + throw new NotImplementedException(); + } + } +} diff --git a/mzLib/FlashLFQ/FlashLFQ.csproj b/mzLib/FlashLFQ/FlashLFQ.csproj index 52d33530e..ca3714d03 100644 --- a/mzLib/FlashLFQ/FlashLFQ.csproj +++ b/mzLib/FlashLFQ/FlashLFQ.csproj @@ -13,6 +13,8 @@ + + diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 09cce4604..1687d1bf6 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -13,6 +13,15 @@ using System.Runtime.CompilerServices; using System.IO; using Easy.Common.Extensions; +using pepXML.Generated; +using FlashLFQ.Alex_project; +using MathNet.Numerics.LinearAlgebra.Factorization; +using System.Data.Entity.Core.Objects.DataClasses; +using static System.Formats.Asn1.AsnWriter; +using System.Runtime; +using System.Collections.Immutable; +using MathNet.Numerics; +using System.Diagnostics.Metrics; [assembly: InternalsVisibleTo("TestFlashLFQ")] @@ -36,11 +45,14 @@ public class FlashLfqEngine public readonly bool QuantifyAmbiguousPeptides; // MBR settings + public readonly bool IsobaricCase; public readonly bool MatchBetweenRuns; public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; public readonly bool RequireMsmsIdInCondition; private int _numberOfAnchorPeptidesForMbr = 3; // the number of anchor peptides used for local alignment when predicting retention times of MBR acceptor peptides + public Dictionary>> IsobaricPeakInDifferentRun; + public Dictionary IsobaricCaseDict; // settings for the Bayesian protein quantification engine public readonly bool BayesianProteinQuant; @@ -81,6 +93,7 @@ public FlashLfqEngine( int maxThreads = -1, // MBR settings + bool isobaricCase = false, bool matchBetweenRuns = false, double matchBetweenRunsPpmTolerance = 10.0, double maxMbrWindow = 2.5, @@ -119,6 +132,7 @@ public FlashLfqEngine( Silent = silent; IdSpecificChargeState = idSpecificChargeState; MbrRtWindow = maxMbrWindow; + IsobaricCase = isobaricCase; RequireMsmsIdInCondition = requireMsmsIdInCondition; Normalize = normalize; @@ -188,6 +202,85 @@ public FlashLfqResults Run() _peakIndexingEngine.ClearIndex(); } + //Ray's code + if (IsobaricCase) + { + IsobaricPeakInDifferentRun = new Dictionary>>(); + IsobaricCaseDict = new Dictionary(); + var IdGroupedbySeq = _allIdentifications.GroupBy(p => p.BaseSequence); + + Dictionary> ChromPeakInDifferentRun = new Dictionary> (); + + foreach (var spectraFile in _spectraFileInfo) + { + IsobaricCaseDict[spectraFile] = new PeakIndexingEngine(); + IsobaricCaseDict[spectraFile].IndexMassSpectralPeaks(spectraFile, Silent, _ms1Scans); + } + + foreach (var idGroup in IdGroupedbySeq) //Try to interate the all ID in the group + { + List xicGroup = new List(); + + Identification id = idGroup.GroupBy(p=>p.PrecursorChargeState).OrderBy(p=>p.Count()).Last().First(); //Try to get the Id in this group with the most common charge state + + //genrate XIC from each file + bool isFirstOne = true; //index just for choosing the first one to be the reference + foreach (var spectraFile in _spectraFileInfo) + { + + if (isFirstOne) + { + XIC xic = buildXIC(id, spectraFile, true); + if (xic!=null && xic.Ms1Peaks.Count() > 5) + { + xicGroup.Add(xic); + isFirstOne = false; + } + } + else + { + XIC xic = buildXIC(id, spectraFile, false); + if (xic != null && xic.Ms1Peaks.Count() > 5) + { + xicGroup.Add(xic); + } + } + } + + // If we have more than one XIC, we can do the MBR + if (xicGroup.Count > 1) + { + + XICGroups xICGroups = new XICGroups(xicGroup); // try to build the XICGroups + Dictionary> peakPairChorm = new Dictionary>(); + int PeakIndex = 0; // The index of the peak in the XICGroups + + + foreach (var peak in xICGroups.indexedPeaks) // try to add all of the sharedPeak in the group + { + double PeakWindow = peak.Value.Item2 - peak.Value.Item1; + if (PeakWindow > 0.001) + { + List chromPeaksInThisSequence = new List(); + CollectAllPeakInDifferentRun(peak, chromPeaksInThisSequence, xICGroups, idGroup); + peakPairChorm[PeakIndex] = chromPeaksInThisSequence; + PeakIndex++; + } + + + } + + IsobaricPeakInDifferentRun[id.BaseSequence] = peakPairChorm; + + } + + + + } + + } + + // do MBR if (MatchBetweenRuns) { @@ -869,7 +962,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } /// - /// Finds MBR acceptor peaks by looping through every possible peak for every possible charge state + /// Finds MBR acceptor peaks by looping through every possible peak for every possible charge state /// in a given retention time range. Identified peaks are added to the matchBetweenRunsIdentifiedPeaks dictionary. /// /// The MbrScorer object used to score acceptor peaks @@ -890,6 +983,7 @@ internal void FindAllAcceptorPeaks( Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + // Try to snipped the MS1 scans to the region where the analyte should appear for (int j = 0; j < ms1ScanInfos.Length; j++) { Ms1ScanInfo scan = ms1ScanInfos[j]; @@ -1163,7 +1257,7 @@ public List GetIsotopicEnvelopes( for (int i = start; i < theoreticalIsotopeAbundances.Length && i >= 0; i += direction) { double isotopeMass = identification.MonoisotopicMass + observedMassError + - theoreticalIsotopeMassShifts[i] + shift.Key * Constants.C13MinusC12; + theoreticalIsotopeMassShifts[i] + shift.Key * Chemistry.Constants.C13MinusC12; double theoreticalIsotopeIntensity = theoreticalIsotopeAbundances[i] * peak.Intensity; IndexedMassSpectralPeak isotopePeak = _peakIndexingEngine.GetIndexedPeak(isotopeMass, @@ -1240,7 +1334,7 @@ public bool CheckIsotopicEnvelopeCorrelation( continue; } - double unexpectedMass = shift.Value.Min(p => p.theorMass) - Constants.C13MinusC12; + double unexpectedMass = shift.Value.Min(p => p.theorMass) - Chemistry.Constants.C13MinusC12; IndexedMassSpectralPeak unexpectedPeak = _peakIndexingEngine.GetIndexedPeak(unexpectedMass, peak.ZeroBasedMs1ScanIndex, isotopeTolerance, chargeState); @@ -1468,5 +1562,136 @@ private void CutPeak(ChromatographicPeak peak, double identificationTime) CutPeak(peak, identificationTime); } } + + internal XIC buildXIC(Identification id, SpectraFileInfo spectraFile, bool isReference) + { + PeakIndexingEngine peakIndexingEngine = IsobaricCaseDict[spectraFile]; + PpmTolerance isotopeTolerance = new PpmTolerance(PpmTolerance); + Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[spectraFile]; + Ms1ScanInfo start = ms1ScanInfos[0]; + Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + List peaks = new List(); + + + for (int j = start.ZeroBasedMs1ScanIndex; j <= end.ZeroBasedMs1ScanIndex; j++) + { + IndexedMassSpectralPeak peak = peakIndexingEngine.GetIndexedPeak(id.PeakfindingMass, j, isotopeTolerance, id.PrecursorChargeState); + if (peak != null) + { + peaks.Add(peak); + } + } + + + XIC xIC = null; + + if (peaks.Count > 0) + { + xIC = new XIC(peaks, id.PeakfindingMass, spectraFile, isReference); + } + + return xIC; + } + + /// + /// Pick a valid peaks in the XIC, and store their region/window information into the ChromatographicPeak object. + /// Besides, we will store data from the corresponding file/Run. Finally output the the whole list of ChromatographicPeak as the result. + /// For example, we have four invaild peak in the XIC, then we look at the first peak, generate chormPeak from each file. The dictionary will be like: peak1: [chromPeak (run 1), chromPeak (run 2), chromPeak (run 3)...] + /// + /// The time window for the valid peak, format is