diff --git a/.github/workflows/dotnet.yml b/.github/workflows/dotnet.yml index 735b3557a..f89904dc3 100644 --- a/.github/workflows/dotnet.yml +++ b/.github/workflows/dotnet.yml @@ -15,7 +15,7 @@ jobs: - name: Set up .NET uses: actions/setup-dotnet@v1 with: - dotnet-version: 6.0.x + dotnet-version: 8.0.x - name: Restore dependencies run: cd mzLib && dotnet restore - name: Build @@ -25,9 +25,9 @@ jobs: - name: Build (TestFlashLFQ) run: cd mzLib && dotnet build --no-restore ./TestFlashLFQ/TestFlashLFQ.csproj - name: Add coverlet collector (Test) - run: cd mzLib && dotnet add Test/Test.csproj package coverlet.collector -v 6.0.0 + run: cd mzLib && dotnet add Test/Test.csproj package coverlet.collector -v 6.0.2 - name: Add coverlet collector (TestFlashLFQ) - run: cd mzLib && dotnet add TestFlashLFQ/TestFlashLFQ.csproj package coverlet.collector -v 6.0.0 + run: cd mzLib && dotnet add TestFlashLFQ/TestFlashLFQ.csproj package coverlet.collector -v 6.0.2 - name: Test run: cd mzLib && dotnet test --no-build --verbosity normal --collect:"XPlat Code Coverage" /p:CoverletOutputFormat=cobertura ./Test/Test.csproj - name: TestFlashLFQ diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 559174294..b5b516a83 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ jobs: - name: Setup .NET uses: actions/setup-dotnet@v1 with: - dotnet-version: 6.0.x + dotnet-version: 8.0.x - name: Set up NuGet uses: nuget/setup-nuget@v1 with: diff --git a/mzLib/BayesianEstimation/BayesianEstimation.csproj b/mzLib/BayesianEstimation/BayesianEstimation.csproj index e37dee889..4d0ba7ee2 100644 --- a/mzLib/BayesianEstimation/BayesianEstimation.csproj +++ b/mzLib/BayesianEstimation/BayesianEstimation.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 x64 @@ -11,6 +11,7 @@ + diff --git a/mzLib/Chemistry/ChemicalFormula.cs b/mzLib/Chemistry/ChemicalFormula.cs index 148ed209d..d9a24649f 100644 --- a/mzLib/Chemistry/ChemicalFormula.cs +++ 
b/mzLib/Chemistry/ChemicalFormula.cs @@ -19,9 +19,11 @@ using MzLibUtil; using System; using System.Collections.Generic; +using System.ComponentModel.DataAnnotations.Schema; using System.Globalization; using System.Linq; using System.Text; +using System.Text.Json.Serialization; using System.Text.RegularExpressions; namespace Chemistry @@ -65,7 +67,7 @@ public ChemicalFormula(IHasChemicalFormula capFormula) Elements = new Dictionary(capFormula.ThisChemicalFormula.Elements); } - public ChemicalFormula ThisChemicalFormula => this; + [JsonIgnore] public ChemicalFormula ThisChemicalFormula => this; /// /// Gets the average mass of this chemical formula diff --git a/mzLib/Chemistry/Chemistry.csproj b/mzLib/Chemistry/Chemistry.csproj index fc57948ce..816d4eefd 100644 --- a/mzLib/Chemistry/Chemistry.csproj +++ b/mzLib/Chemistry/Chemistry.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 x64 @@ -9,6 +9,10 @@ full true + + + + diff --git a/mzLib/Chemistry/ClassExtensions.cs b/mzLib/Chemistry/ClassExtensions.cs index 8bb5aecdc..7093e1f5f 100644 --- a/mzLib/Chemistry/ClassExtensions.cs +++ b/mzLib/Chemistry/ClassExtensions.cs @@ -48,6 +48,7 @@ public static double ToMass(this double massToChargeRatio, int charge) return Math.Abs(charge) * massToChargeRatio - charge * Constants.ProtonMass; } + public static double? RoundedDouble(this double myNumber, int places = 9) => RoundedDouble(myNumber as double?, places); public static double? RoundedDouble(this double? 
myNumber, int places = 9) { if (myNumber != null) diff --git a/mzLib/Development/Deconvolution/TestCases/TestDevelopmentTestCases.cs b/mzLib/Development/Deconvolution/TestCases/TestDevelopmentTestCases.cs index 9b62af60a..345d962c3 100644 --- a/mzLib/Development/Deconvolution/TestCases/TestDevelopmentTestCases.cs +++ b/mzLib/Development/Deconvolution/TestCases/TestDevelopmentTestCases.cs @@ -1,5 +1,6 @@ using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using MassSpectrometry; using System.Diagnostics.CodeAnalysis; using Readers; diff --git a/mzLib/Development/Development.csproj b/mzLib/Development/Development.csproj index 0dc946d45..0f0eaf199 100644 --- a/mzLib/Development/Development.csproj +++ b/mzLib/Development/Development.csproj @@ -1,16 +1,17 @@ - net6.0 + net8.0 x64 enable enable - - - + + + + diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 5d1e238d6..d7bac2195 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -1,22 +1,38 @@ -using Chemistry; -using MathNet.Numerics.Statistics; -using System; +using System; using System.Collections.Generic; using System.Linq; using System.Text; +using ClassExtensions = Chemistry.ClassExtensions; +using FlashLFQ.PEP; namespace FlashLFQ { public class ChromatographicPeak { public double Intensity; + public double ApexRetentionTime => Apex?.IndexedPeak.RetentionTime ?? 
-1; public readonly SpectraFileInfo SpectraFileInfo; public List IsotopicEnvelopes; + public int ScanCount => IsotopicEnvelopes.Count; public double SplitRT; public readonly bool IsMbrPeak; public double MbrScore; + public double PpmScore { get; set; } + public double IntensityScore { get; set; } + public double RtScore { get; set; } + public double ScanCountScore { get; set; } + public double IsotopicDistributionScore { get; set; } + /// + /// Stores the pearson correlation between the apex isotopic envelope and the theoretical isotopic distribution + /// + public double IsotopicPearsonCorrelation => Apex?.PearsonCorrelation ?? -1; + public double RtPredictionError { get; set; } + public List ChargeList { get; set; } + internal double MbrQValue { get; set; } + public ChromatographicPeakData PepPeakData { get; set; } + public double? MbrPep { get; set; } - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo) + public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) { SplitRT = 0; NumChargeStatesObserved = 0; @@ -27,6 +43,14 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi IsotopicEnvelopes = new List(); IsMbrPeak = isMbrPeak; SpectraFileInfo = fileInfo; + RandomRt = randomRt; + } + + public bool Equals(ChromatographicPeak peak) + { + return SpectraFileInfo.Equals(peak.SpectraFileInfo) + && Identifications.First().ModifiedSequence.Equals(peak.Identifications.First().ModifiedSequence) + && ApexRetentionTime == peak.ApexRetentionTime; } public IsotopicEnvelope Apex { get; private set; } @@ -36,70 +60,17 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi public int NumIdentificationsByFullSeq { get; private set; } public double MassError { get; private set; } /// - /// Expected retention time for MBR acceptor peaks (mean) + /// Bool that describes whether the retention time of this peak was randomized + /// If 
true, implies that this peak is a decoy peak identified by the MBR algorithm /// - public double? RtHypothesis { get; private set; } - /// - /// Std. Dev of retention time differences between MBR acceptor file and donor file, used if # calibration points < 6 - /// - public double? RtStdDev { get; private set; } - /// - /// Interquartile range of retention time differences between MBR acceptor file and donor file, used if # calibration points >= 6 - /// - public double? RtInterquartileRange { get; private set; } - - public static string TabSeparatedHeader - { - get - { - var sb = new StringBuilder(); - sb.Append("File Name" + "\t"); - sb.Append("Base Sequence" + "\t"); - sb.Append("Full Sequence" + "\t"); - sb.Append("Protein Group" + "\t"); - sb.Append("Peptide Monoisotopic Mass" + "\t"); - sb.Append("MS2 Retention Time" + "\t"); - sb.Append("Precursor Charge" + "\t"); - sb.Append("Theoretical MZ" + "\t"); - sb.Append("Peak intensity" + "\t"); - sb.Append("Peak RT Start" + "\t"); - sb.Append("Peak RT Apex" + "\t"); - sb.Append("Peak RT End" + "\t"); - sb.Append("Peak MZ" + "\t"); - sb.Append("Peak Charge" + "\t"); - sb.Append("Num Charge States Observed" + "\t"); - sb.Append("Peak Detection Type" + "\t"); - sb.Append("MBR Score" + "\t"); - sb.Append("PSMs Mapped" + "\t"); - sb.Append("Base Sequences Mapped" + "\t"); - sb.Append("Full Sequences Mapped" + "\t"); - sb.Append("Peak Split Valley RT" + "\t"); - sb.Append("Peak Apex Mass Error (ppm)"); - sb.Append("\tMBR Predicted RT"); - //sb.Append("Timepoints"); - return sb.ToString(); - } - } - - /// - /// Sets retention time information for a given peak. Used for MBR peaks - /// - /// Expected retention time for peak, based on alignment between a donor and acceptor file - /// Standard deviation in the retention time differences between aligned peaks - /// Interquartile range og the retention time differences between aligned peaks - internal void SetRtWindow(double rtHypothesis, double? rtStdDev, double? 
rtInterquartileRange) - { - RtHypothesis = rtHypothesis; - RtStdDev = rtStdDev; - RtInterquartileRange = rtInterquartileRange; - } + public bool RandomRt { get; } + public bool DecoyPeptide => Identifications.First().IsDecoy; public void CalculateIntensityForThisFeature(bool integrate) { if (IsotopicEnvelopes.Any()) { - double maxIntensity = IsotopicEnvelopes.Max(p => p.Intensity); - Apex = IsotopicEnvelopes.First(p => p.Intensity == maxIntensity); + Apex = IsotopicEnvelopes.MaxBy(p => p.Intensity); if (integrate) { @@ -146,7 +117,7 @@ public void MergeFeatureWith(ChromatographicPeak otherFeature, bool integrate) this.Identifications = this.Identifications .Union(otherFeature.Identifications) .Distinct() - .OrderBy(p => p.PosteriorErrorProbability).ToList(); + .ToList(); ResolveIdentifications(); this.IsotopicEnvelopes.AddRange(otherFeature.IsotopicEnvelopes .Where(p => !thisFeaturesPeaks.Contains(p.IndexedPeak))); @@ -162,7 +133,40 @@ public void ResolveIdentifications() this.NumIdentificationsByBaseSeq = Identifications.Select(v => v.BaseSequence).Distinct().Count(); this.NumIdentificationsByFullSeq = Identifications.Select(v => v.ModifiedSequence).Distinct().Count(); } - + public static string TabSeparatedHeader + { + get + { + var sb = new StringBuilder(); + sb.Append("File Name" + "\t"); + sb.Append("Base Sequence" + "\t"); + sb.Append("Full Sequence" + "\t"); + sb.Append("Protein Group" + "\t"); + sb.Append("Organism" + '\t'); + sb.Append("Peptide Monoisotopic Mass" + "\t"); + sb.Append("MS2 Retention Time" + "\t"); + sb.Append("Precursor Charge" + "\t"); + sb.Append("Theoretical MZ" + "\t"); + sb.Append("Peak intensity" + "\t"); + sb.Append("Peak RT Start" + "\t"); + sb.Append("Peak RT Apex" + "\t"); + sb.Append("Peak RT End" + "\t"); + sb.Append("Peak MZ" + "\t"); + sb.Append("Peak Charge" + "\t"); + sb.Append("Num Charge States Observed" + "\t"); + sb.Append("Peak Detection Type" + "\t"); + sb.Append("PIP Q-Value" + "\t"); + sb.Append("PIP PEP" + "\t"); + 
sb.Append("PSMs Mapped" + "\t"); + sb.Append("Base Sequences Mapped" + "\t"); + sb.Append("Full Sequences Mapped" + "\t"); + sb.Append("Peak Split Valley RT" + "\t"); + sb.Append("Peak Apex Mass Error (ppm)" + "\t"); + sb.Append("Decoy Peptide" + "\t"); + sb.Append("Random RT"); + return sb.ToString(); + } + } public override string ToString() { StringBuilder sb = new StringBuilder(); @@ -193,10 +197,12 @@ public override string ToString() if (t.Any()) { sb.Append(string.Join(";", t) + '\t'); + sb.Append(string.Join(";", Identifications.SelectMany(id => id.ProteinGroups).Select(p => p.Organism).Distinct()) + '\t'); } else { sb.Append("" + '\t'); + sb.Append("" + '\t'); } sb.Append("" + Identifications.First().MonoisotopicMass + '\t'); @@ -243,14 +249,16 @@ public override string ToString() sb.Append("" + "MSMS" + "\t"); } - sb.Append("" + (IsMbrPeak ? MbrScore.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? MbrQValue.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? MbrPep.ToString() : "") + "\t"); sb.Append("" + Identifications.Count + "\t"); sb.Append("" + NumIdentificationsByBaseSeq + "\t"); sb.Append("" + NumIdentificationsByFullSeq + "\t"); sb.Append("" + SplitRT + "\t"); sb.Append("" + MassError); - sb.Append("\t" + (IsMbrPeak ? 
RtHypothesis.ToString() : "")); + sb.Append("\t" + DecoyPeptide); + sb.Append("\t" + RandomRt); return sb.ToString(); } diff --git a/mzLib/FlashLFQ/FlashLFQ.csproj b/mzLib/FlashLFQ/FlashLFQ.csproj index 52d33530e..d5c466967 100644 --- a/mzLib/FlashLFQ/FlashLFQ.csproj +++ b/mzLib/FlashLFQ/FlashLFQ.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 x64 @@ -11,8 +11,11 @@ + - + + + diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index 6261f1f29..b20daa7ea 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -15,27 +15,26 @@ public class FlashLfqResults public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; private readonly HashSet _peptideModifiedSequencesToQuantify; + public string PepResultString { get; set; } + public double MbrQValueThreshold { get; set; } - public FlashLfqResults(List spectraFiles, List identifications, HashSet peptides = null) + public FlashLfqResults(List spectraFiles, List identifications, double mbrQValueThreshold = 0.05, + HashSet peptideModifiedSequencesToQuantify = null) { SpectraFiles = spectraFiles; PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); - if(peptides == null || !peptides.Any()) - { - peptides = identifications.Select(id => id.ModifiedSequence).ToHashSet(); - } - _peptideModifiedSequencesToQuantify = peptides; + MbrQValueThreshold = mbrQValueThreshold; + _peptideModifiedSequencesToQuantify = peptideModifiedSequencesToQuantify ?? identifications.Where(id => !id.IsDecoy).Select(id => id.ModifiedSequence).ToHashSet(); foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); } - // Only quantify peptides within the set of valid peptide modified (full) sequences. 
This is done to enable pepitde-level FDR control of reported results - foreach (Identification id in identifications.Where(id => peptides.Contains(id.ModifiedSequence))) + foreach (Identification id in identifications.Where(id => !id.IsDecoy & _peptideModifiedSequencesToQuantify.Contains(id.ModifiedSequence))) { if (!PeptideModifiedSequences.TryGetValue(id.ModifiedSequence, out Peptide peptide)) { @@ -59,6 +58,17 @@ public FlashLfqResults(List spectraFiles, List } } + public void ReNormalizeResults(bool integrate = false, int maxThreads = 10, bool useSharedPeptides = false) + { + foreach(var peak in Peaks.SelectMany(p => p.Value)) + { + peak.CalculateIntensityForThisFeature(integrate); + } + new IntensityNormalizationEngine(this, integrate, silent: true, maxThreads).NormalizeResults(); + CalculatePeptideResults(quantifyAmbiguousPeptides: false); + CalculateProteinResultsMedianPolish(useSharedPeptides: useSharedPeptides); + } + public void MergeResultsWith(FlashLfqResults mergeFrom) { this.SpectraFiles.AddRange(mergeFrom.SpectraFiles); @@ -128,6 +138,8 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides) { var groupedPeaks = filePeaks.Value .Where(p => p.NumIdentificationsByFullSeq == 1) + .Where(p => !p.Identifications.First().IsDecoy) + .Where(p => !p.IsMbrPeak || (p.MbrQValue < MbrQValueThreshold && !p.RandomRt)) .GroupBy(p => p.Identifications.First().ModifiedSequence) .Where(group => _peptideModifiedSequencesToQuantify.Contains(group.Key)) .ToList(); @@ -163,11 +175,15 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides) // report ambiguous quantification var ambiguousPeaks = filePeaks.Value .Where(p => p.NumIdentificationsByFullSeq > 1) + .Where(p => !p.Identifications.First().IsDecoy) + .Where(p => !p.IsMbrPeak || (p.MbrQValue < MbrQValueThreshold && !p.RandomRt)) .ToList(); foreach (ChromatographicPeak ambiguousPeak in ambiguousPeaks) { - foreach (Identification id in ambiguousPeak.Identifications) + foreach 
(Identification id in ambiguousPeak.Identifications.Where(id => !id.IsDecoy)) { + if (!_peptideModifiedSequencesToQuantify.Contains(id.ModifiedSequence)) continue; // Ignore the ids/sequences we don't want to quantify + string sequence = id.ModifiedSequence; double alreadyRecordedIntensity = PeptideModifiedSequences[sequence].GetIntensity(filePeaks.Key); @@ -224,7 +240,7 @@ private void HandleAmbiguityInFractions() foreach (SpectraFileInfo file in sample) { - foreach (ChromatographicPeak peak in Peaks[file]) + foreach (ChromatographicPeak peak in Peaks[file].Where(p => !p.IsMbrPeak || p.MbrQValue < MbrQValueThreshold)) { foreach (Identification id in peak.Identifications) { diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 2e877d1f7..01d3c6dc0 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -12,11 +12,21 @@ using UsefulProteomicsDatabases; using System.Runtime.CompilerServices; using Easy.Common.Extensions; +using FlashLFQ.PEP; +using System.IO; +using System.Threading; [assembly: InternalsVisibleTo("TestFlashLFQ")] namespace FlashLFQ { + public enum DonorCriterion + { + Score, + Intensity, + Neighbors + } + public class FlashLfqEngine { // settings @@ -36,10 +46,25 @@ public class FlashLfqEngine // MBR settings public readonly bool MatchBetweenRuns; - public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; + public readonly double MbrDetectionQValueThreshold; + private int _numberOfAnchorPeptidesForMbr = 3; // the number of anchor peptides used for local alignment when predicting retention times of MBR acceptor peptides + + // New MBR Settings + public readonly double RtWindowIncrease = 0; + public readonly double MbrAlignmentWindow = 2.5; + public readonly double PepTrainingFraction = 0.25; + /// + /// Specifies how the donor peak for MBR is selected. 
+ /// 'Score' selects the donor peak associated with the highest scoring PSM + /// 'Intensity' selects the donor peak with the max intensity + /// 'Neighbors' selects the donor peak with the most neighboring peaks + /// + public DonorCriterion DonorCriterion { get; init; } + public readonly double DonorQValueThreshold; public readonly bool RequireMsmsIdInCondition; + private int _randomSeed = 42; // settings for the Bayesian protein quantification engine public readonly bool BayesianProteinQuant; @@ -54,7 +79,6 @@ public class FlashLfqEngine // structures used in the FlashLFQ engine private List _spectraFileInfo; - private Stopwatch _globalStopwatch; private List _allIdentifications; /// @@ -62,7 +86,7 @@ public class FlashLfqEngine /// Other peptides may appear in the QuantifiedPeaks output, but this list is used to enable /// peptide-level FDR filtering /// - public HashSet PeptidesModifiedSequencesToQuantify { get; init; } + public HashSet PeptideModifiedSequencesToQuantify { get; init; } /// /// Dictionary linking a modified sequence to a List of tuples containing /// the mass shifts (isotope mass - monoisotopic mass) and normalized abundances for the @@ -73,6 +97,7 @@ public class FlashLfqEngine private FlashLfqResults _results; internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; + internal Dictionary> DonorFileToPeakDict { get; private set; } /// /// Create an instance of FlashLFQ that will quantify peptides based on their precursor intensity in MS1 spectra @@ -96,8 +121,9 @@ public FlashLfqEngine( // MBR settings bool matchBetweenRuns = false, double matchBetweenRunsPpmTolerance = 10.0, - double maxMbrWindow = 2.5, + double maxMbrWindow = 1.0, bool requireMsmsIdInCondition = false, + double matchBetweenRunsFdrThreshold = 0.05, // settings for the Bayesian protein quantification engine bool bayesianProteinQuant = false, @@ -107,8 +133,10 @@ public FlashLfqEngine( int mcmcBurninSteps = 1000, bool useSharedPeptidesForProteinQuant = 
false, bool pairedSamples = false, - List peptideSequencesToUse = null, - int? randomSeed = null) + int? randomSeed = null, + DonorCriterion donorCriterion = DonorCriterion.Score, + double donorQValueThreshold = 0.01, + List peptideSequencesToQuantify = null) { Loaders.LoadElements(); @@ -123,16 +151,17 @@ public FlashLfqEngine( .ThenBy(p => p.TechnicalReplicate).ToList(); _allIdentifications = allIdentifications; + PeptideModifiedSequencesToQuantify = peptideSequencesToQuantify.IsNotNullOrEmpty() + ? new HashSet(peptideSequencesToQuantify) + : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet(); PpmTolerance = ppmTolerance; IsotopePpmTolerance = isotopeTolerancePpm; - MatchBetweenRuns = matchBetweenRuns; - MbrPpmTolerance = matchBetweenRunsPpmTolerance; + Integrate = integrate; NumIsotopesRequired = numIsotopesRequired; QuantifyAmbiguousPeptides = quantifyAmbiguousPeptides; Silent = silent; IdSpecificChargeState = idSpecificChargeState; - MbrRtWindow = maxMbrWindow; RequireMsmsIdInCondition = requireMsmsIdInCondition; Normalize = normalize; MaxThreads = maxThreads; @@ -143,8 +172,14 @@ public FlashLfqEngine( McmcSteps = mcmcSteps; McmcBurninSteps = mcmcBurninSteps; UseSharedPeptidesForProteinQuant = useSharedPeptidesForProteinQuant; - PeptidesModifiedSequencesToQuantify = peptideSequencesToUse.IsNotNullOrEmpty() ? 
new HashSet(peptideSequencesToUse) - : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet(); + + // MBR settings + MatchBetweenRuns = matchBetweenRuns; + MbrPpmTolerance = matchBetweenRunsPpmTolerance; + MbrRtWindow = maxMbrWindow; + DonorCriterion = donorCriterion; + DonorQValueThreshold = donorQValueThreshold; + MbrDetectionQValueThreshold = matchBetweenRunsFdrThreshold; RandomSeed = randomSeed; if (MaxThreads == -1 || MaxThreads >= Environment.ProcessorCount) @@ -166,7 +201,7 @@ public FlashLfqResults Run() { _globalStopwatch.Start(); _ms1Scans = new Dictionary(); - _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, PeptidesModifiedSequencesToQuantify); + _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, MbrDetectionQValueThreshold, PeptideModifiedSequencesToQuantify); // build m/z index keys CalculateTheoreticalIsotopeDistributions(); @@ -206,6 +241,8 @@ public FlashLfqResults Run() // do MBR if (MatchBetweenRuns) { + Console.WriteLine("Find the best donors for match-between-runs"); + FindPeptideDonorFiles(); foreach (var spectraFile in _spectraFileInfo) { if (!Silent) @@ -214,7 +251,6 @@ public FlashLfqResults Run() } QuantifyMatchBetweenRunsPeaks(spectraFile); - _peakIndexingEngine.ClearIndex(); if (!Silent) @@ -222,6 +258,14 @@ public FlashLfqResults Run() Console.WriteLine("Finished MBR for " + spectraFile.FilenameWithoutExtension); } } + + Console.WriteLine("Computing PEP for MBR Transfers"); + bool pepSuccesful = RunPEPAnalysis(); + + foreach (var spectraFile in _spectraFileInfo) + { + CalculateFdrForMbrPeaks(spectraFile, pepSuccesful); + } } // normalize @@ -279,6 +323,7 @@ public PeakIndexingEngine GetIndexingEngine() { return _peakIndexingEngine; } + /// /// Creates a theoretical isotope distribution for each of the identified sequences /// If the sequence is modified and the modification has an unknown chemical formula, @@ -488,6 +533,160 @@ private void QuantifyMs2IdentifiedPeptides(SpectraFileInfo 
fileInfo) _results.Peaks[fileInfo].AddRange(chromatographicPeaks.ToList()); } + #region MatchBetweenRuns + /// + /// Used by the match-between-runs algorithm to determine systematic retention time drifts between + /// chromatographic runs. + /// + private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, MbrScorer scorer, + out List donorFileBestMsmsPeaksOrderedByMass) + { + Dictionary donorFileBestMsmsPeaks = new(); + Dictionary acceptorFileBestMsmsPeaks = new(); + List rtCalibrationCurve = new(); + List anchorPeptideRtDiffs = new(); // anchor peptides are peptides that were MS2 detected in both the donor and acceptor runs + + Dictionary> donorFileAllMsmsPeaks = _results.Peaks[donor] + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && !peak.IsMbrPeak + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each unique donor sequence + foreach (var sequencePeakListKvp in donorFileAllMsmsPeaks) + { + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + donorFileBestMsmsPeaks.Add(sequencePeakListKvp.Key, bestPeak); + } + + Dictionary> acceptorFileAllMsmsPeaks = _results.Peaks[acceptor] + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && !peak.IsMbrPeak + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each acceptor sequence + foreach (var sequencePeakListKvp in acceptorFileAllMsmsPeaks) + { + List peaksForPeptide = sequencePeakListKvp.Value; + if 
(!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + acceptorFileBestMsmsPeaks.Add(sequencePeakListKvp.Key, bestPeak); + } + + // create RT calibration curve + foreach (var peak in acceptorFileBestMsmsPeaks) + { + ChromatographicPeak acceptorFilePeak = peak.Value; + + if (donorFileBestMsmsPeaks.TryGetValue(peak.Key, out ChromatographicPeak donorFilePeak)) + { + rtCalibrationCurve.Add(new RetentionTimeCalibDataPoint(donorFilePeak, acceptorFilePeak)); + if (donorFilePeak.ApexRetentionTime > 0 && acceptorFilePeak.ApexRetentionTime > 0) + { + anchorPeptideRtDiffs.Add(donorFilePeak.ApexRetentionTime - acceptorFilePeak.ApexRetentionTime); + } + } + } + + scorer.AddRtPredErrorDistribution(donor, anchorPeptideRtDiffs, _numberOfAnchorPeptidesForMbr); + donorFileBestMsmsPeaksOrderedByMass = donorFileBestMsmsPeaks.Select(kvp => kvp.Value).OrderBy(p => p.Identifications.First().PeakfindingMass).ToList(); + + return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); + } + + /// + /// For every MSMS identified peptide, selects one file that will be used as the donor + /// by choosing the best peak for that peptide according to the configured DonorCriterion (score, intensity, or number of neighboring peaks), + /// then writes the results to the DonorFileToPeakDict. + /// WARNING! 
Strong assumption that this is called BEFORE MBR peaks are identified/assigned to the results + /// + private void FindPeptideDonorFiles() + { + DonorFileToPeakDict = new Dictionary>(); + + Dictionary> seqPeakDict = _results.Peaks + .SelectMany(kvp => kvp.Value) + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .Where(group => PeptideModifiedSequencesToQuantify.Contains(group.Key)) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each unique sequence + foreach (var sequencePeakListKvp in seqPeakDict) + { + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + if (DonorFileToPeakDict.ContainsKey(bestPeak.SpectraFileInfo)) + { + DonorFileToPeakDict[bestPeak.SpectraFileInfo].Add(bestPeak); + } + else + { + DonorFileToPeakDict.Add(bestPeak.SpectraFileInfo, new List { bestPeak }); + } + } + } + + internal ChromatographicPeak ChooseBestPeak(List peaks) + { + ChromatographicPeak bestPeak = null; + switch (DonorCriterion) + { + case DonorCriterion.Score: // Select best peak by the PSM score + bestPeak = peaks.MaxBy(peak => peak.Identifications.Max(id => id.PsmScore)); + if (bestPeak.Identifications.First().PsmScore > 0) + break; + else // if every ID has a score of zero, let it fall through to the default case + goto default; + case DonorCriterion.Neighbors: // Select peak with the most neighboring peaks + int maxPeaks = 0; + foreach (var donorPeak in peaks) + { + // Count the number of neighboring peaks with unique peptides + int neighboringPeaksCount = _results.Peaks[donorPeak.SpectraFileInfo] + .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) + .Select(peak => 
peak.Identifications.First().ModifiedSequence) + .Distinct() + .Count(); + + if (neighboringPeaksCount > maxPeaks) + { + maxPeaks = neighboringPeaksCount; + bestPeak = donorPeak; + } + } + break; + case DonorCriterion.Intensity: // Select the peak with the highest intensity + default: + bestPeak = peaks.MaxBy(peak => peak.Intensity); + break; + } + + return bestPeak; + } + /// /// Used by MBR. Predicts the retention time of a peak in an acceptor file based on the /// retention time of the peak in the donor file. This is done with a local alignment @@ -503,8 +702,7 @@ internal RtInfo PredictRetentionTime( bool acceptorSampleIsFractionated, bool donorSampleIsFractionated) { - - var nearbyCalibrationPoints = new List(); + var nearbyCalibrationPoints = new List(); // The number of anchor peptides to be used for local alignment (on either side of the donor peptide) // only compare +- 1 fraction if (acceptorSampleIsFractionated && donorSampleIsFractionated) @@ -531,67 +729,70 @@ internal RtInfo PredictRetentionTime( index = rtCalibrationCurve.Length - 1; } + int numberOfForwardAnchors = 0; // gather nearby data points - for (int r = index; r < rtCalibrationCurve.Length; r++) + for (int r = index + 1; r < rtCalibrationCurve.Length; r++) { double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - - if (Math.Abs(rtDiff) < 0.5) + if (rtCalibrationCurve[r].AcceptorFilePeak != null + && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { + if (Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment + { + break; + } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); - } - else - { - break; + numberOfForwardAnchors++; + if (numberOfForwardAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points + { + break; + } } } + int numberOfBackwardsAnchors = 0; for (int r = index - 1; r >= 0; r--) { double rtDiff = 
rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - - if (Math.Abs(rtDiff) < 0.5) + if (rtCalibrationCurve[r].AcceptorFilePeak != null + && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { + if (Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment + { + break; + } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); - } - else - { - break; + numberOfBackwardsAnchors++; + if (numberOfBackwardsAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points + { + break; + } } } if (!nearbyCalibrationPoints.Any()) { - return null; + // If there are no nearby calibration points, return the donor peak's RT and a width of 15 seconds + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime, width: 0.25); } // calculate difference between acceptor and donor RTs for these RT region List rtDiffs = nearbyCalibrationPoints - .Select(p => p.AcceptorFilePeak.Apex.IndexedPeak.RetentionTime - p.DonorFilePeak.Apex.IndexedPeak.RetentionTime) + .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) .ToList(); - // figure out the range of RT differences between the files that are "reasonable", centered around the median difference - double median = rtDiffs.Median(); - - // default range (if only 1 datapoint, or SD is 0, range is very high, etc) - double rtRange = MbrRtWindow; - double? rtStdDev = null; - double? 
rtInterquartileRange = null; - - if (nearbyCalibrationPoints.Count < 6 && nearbyCalibrationPoints.Count > 1 && rtDiffs.StandardDeviation() > 0) - { - rtStdDev = rtDiffs.StandardDeviation(); - rtRange = (double)rtStdDev * 6.0; // Multiplication inherited from legacy code, unsure of reason for 6 - } - else if (nearbyCalibrationPoints.Count >= 6 && rtDiffs.InterquartileRange() > 0) + double medianRtDiff = rtDiffs.Median(); + if(rtDiffs.Count == 1) { - rtInterquartileRange = rtDiffs.InterquartileRange(); - rtRange = (double)rtInterquartileRange * 4.5; // Multiplication inherited from legacy code, unsure of reason for 4.5 + // If there is only one nearby calibration point, shift the donor peak's RT by that single RT difference and use a width of 15 seconds + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: 0.25); } + double rtRange = rtDiffs.StandardDeviation() * 6; + rtRange = Math.Min(rtRange, MbrRtWindow); - return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime + median, width: rtRange, rtSd: rtStdDev, rtInterquartileRange: rtInterquartileRange); + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: rtRange); } /// @@ -606,7 +807,8 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie var apexToAcceptorFilePeakDict = new Dictionary(); List ppmErrors = new List(); foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null - && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence))) + && PeptideModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) + && p.Identifications.First().QValue < DonorQValueThreshold)) { if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak)) { @@ -636,6 +838,56 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, logIntensityDistribution); } + /// + /// Returns a 
pseudo-randomly selected peak that does not have the same mass as the donor + /// + /// + /// Will search for a peak at least 5 Da away from the peakfinding mass + /// + internal ChromatographicPeak GetRandomPeak( + List peaksOrderedByMass, + double donorPeakRetentionTime, + double retentionTimeMinDiff, + Identification donorIdentification) + { + double minDiff = 5 * PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass; + double maxDiff = 11 * PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass; + double donorPeakPeakfindingMass = donorIdentification.PeakfindingMass; + + // Theoretically we could do a binary search but we're just going to iterate through the whole list of donor peaks + List randomPeakCandidates = peaksOrderedByMass + .Where(p => + p.ApexRetentionTime > 0 + && Math.Abs(p.ApexRetentionTime - donorPeakRetentionTime) > retentionTimeMinDiff + && p.Identifications.First().BaseSequence != donorIdentification.BaseSequence + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) > minDiff + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) < maxDiff) + .ToList(); + + while (!randomPeakCandidates.Any() & maxDiff < 1e5) + { + // Increase the search space by a factor of 10 and try again + maxDiff *= 10; + randomPeakCandidates = peaksOrderedByMass + .Where(p => + p.ApexRetentionTime > 0 + && Math.Abs(p.ApexRetentionTime - donorPeakRetentionTime) > retentionTimeMinDiff + && p.Identifications.First().BaseSequence != donorIdentification.BaseSequence + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) > minDiff + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) < maxDiff) + .ToList(); + } + + if (!randomPeakCandidates.Any()) + { + return null; + } + + // Generates a pseudo-random number based on the donor peak finding mass + retention time + int pseudoRandomNumber = (int)(1e5 * (donorIdentification.PeakfindingMass % 1.0) * 
(donorIdentification.Ms2RetentionTimeInMinutes % 1.0)) % randomPeakCandidates.Count; + return randomPeakCandidates[pseudoRandomNumber]; + } + /// /// This method maps identified peaks from other chromatographic runs ("donors") onto /// the defined chromatographic run ("acceptor"). The goal is to reduce the number of missing @@ -656,13 +908,15 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // these are the analytes already identified in this run. we don't need to try to match them from other runs var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks - .Where(p => p.IsotopicEnvelopes.Any()) + .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); if (scorer == null) return; + mbrTol = new PpmTolerance(MbrPpmTolerance); + // deserialize the file's indexed mass spectral peaks. 
these were stored and serialized to a file earlier _peakIndexingEngine.DeserializeIndex(idAcceptorFile); @@ -680,24 +934,24 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // this stores the results of MBR - var matchBetweenRunsIdentifiedPeaks = new Dictionary>>(); + ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); // map each donor file onto this file - foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) + foreach (var donorFilePeakListKvp in DonorFileToPeakDict) { - if (idAcceptorFile.Equals(idDonorFile)) + if (idAcceptorFile.Equals(donorFilePeakListKvp.Key)) { continue; } // this is the list of peaks identified in the other file but not in this one ("ID donor peaks") - List idDonorPeaks = _results.Peaks[idDonorFile].Where(p => - !p.IsMbrPeak - && p.NumIdentificationsByFullSeq == 1 - && p.IsotopicEnvelopes.Any() - && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) // Only do MBR for peptides that we want to quantify - && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) - && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); + List idDonorPeaks = donorFilePeakListKvp.Value + .Where(p => + !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) + && (!RequireMsmsIdInCondition + || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g)))) + && this.PeptideModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence)) + .ToList(); if (!idDonorPeaks.Any()) { @@ -705,7 +959,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } bool donorSampleIsFractionated = _results.SpectraFiles - .Where(p => p.Condition == idDonorFile.Condition && p.BiologicalReplicate == idDonorFile.BiologicalReplicate) + .Where(p => p.Condition == 
donorFilePeakListKvp.Key.Condition && p.BiologicalReplicate == donorFilePeakListKvp.Key.BiologicalReplicate) .Select(p => p.Fraction) .Distinct() .Count() > 1; @@ -713,21 +967,22 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // We're only interested in the fold change if the conditions are different. Otherwise, we score based off of the intensities // of the acceptor file if (_spectraFileInfo.Select(p => p.Condition).Distinct().Count() > 1 - && idDonorFile.Condition != idAcceptorFile.Condition) + && donorFilePeakListKvp.Key.Condition != idAcceptorFile.Condition) { scorer.CalculateFoldChangeBetweenFiles(idDonorPeaks); } // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile); + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile, scorer, out var donorPeaksMassOrdered); + + // break if MBR transfers can't be scored + if (!scorer.IsValid(donorFilePeakListKvp.Key)) continue; // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (range, loopState) => { - var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); - for (int i = range.Item1; i < range.Item2; i++) { ChromatographicPeak donorPeak = idDonorPeaks[i]; @@ -735,50 +990,91 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (rtInfo == null) continue; - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); - } + // Look for MBR target (predicted-RT peak) + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out var bestAcceptor); + 
AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestAcceptor, donorPeak.Identifications.First()); + + //Draw a random donor that has an rt sufficiently far enough away + double minimumRtDifference = rtInfo.Width*2; + ChromatographicPeak randomDonor = GetRandomPeak(donorPeaksMassOrdered, + donorPeak.ApexRetentionTime, + minimumRtDifference, + donorPeak.Identifications.First()); + + // Look for MBR decoy (random-RT peak) + ChromatographicPeak bestDecoy = null; + RtInfo decoyRtInfo = null; + if (randomDonor != null) + { + decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + if (decoyRtInfo != null) + { + //Find a decoy peak using the randomly drawn retention time + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestDecoy, + randomRt: decoyRtInfo.PredictedRt); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestDecoy, donorPeak.Identifications.First()); + } + } - lock (matchBetweenRunsIdentifiedPeaks) - { - foreach (var kvp in matchBetweenRunsIdentifiedPeaksThreadSpecific) + double windowWidth = Math.Max(0.5, rtInfo.Width); + // If the search turned up empty, try again with a wider search window + while (bestAcceptor == null && bestDecoy == null) { - if (matchBetweenRunsIdentifiedPeaks.TryGetValue(kvp.Key, out var list)) + windowWidth = Math.Min(windowWidth, MbrRtWindow); + rtInfo.Width = windowWidth; + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestAcceptor); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestAcceptor, donorPeak.Identifications.First()); + + if(decoyRtInfo != null) + { + decoyRtInfo.Width = windowWidth; + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestDecoy, + randomRt: decoyRtInfo.PredictedRt); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestDecoy, donorPeak.Identifications.First()); + } + if (windowWidth >= MbrRtWindow) 
{ - foreach (var peak in kvp.Value) - { - if (list.TryGetValue(peak.Key, out List existing)) - { - foreach (var acceptorPeak in peak.Value) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; - samePeakSameSequence.Identifications.Add(acceptorPeak.Identifications.First()); - } - else - { - existing.Add(acceptorPeak); - } - } - } - else - { - list.Add(peak.Key, peak.Value); - } - } + break; } else { - matchBetweenRunsIdentifiedPeaks.Add(kvp.Key, kvp.Value); + windowWidth += 0.5; } } + } }); } + // Eliminate duplicate peaks (not sure where they come from) + foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) + { + // Each isotopic envelope is linked to a list of ChromatographicPeaks + // Here, we remove instances where the same envelope is associated with multiple chromatographic peaks but the peaks correspond to the same donor peptide + // I don't know why this happens lol + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications, then they're kept separate. 
+ foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) + { + List bestPeaks = new(); + foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + { + bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); + } + envelopePeakListKvp.Value.Clear(); + envelopePeakListKvp.Value.AddRange(bestPeaks); + } + } + + // Create a dictionary that stores imsPeak associated with an ms/ms identified peptide + Dictionary> msmsImsPeaks = _results.Peaks[idAcceptorFile] + .Where(peak => + !peak.DecoyPeptide + && peak.Apex?.IndexedPeak != null + && PeptideModifiedSequencesToQuantify.Contains(peak.Identifications.First().ModifiedSequence)) + .Select(peak => peak.Apex.IndexedPeak) + .GroupBy(imsPeak => imsPeak.ZeroBasedMs1ScanIndex) + .ToDictionary(g => g.Key, g => g.ToList()); + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -788,35 +1084,101 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) continue; } - List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); - - ChromatographicPeak best = peakHypotheses.First(); - - peakHypotheses.Remove(best); - - if (peakHypotheses.Count > 0) + foreach (var peakHypothesisGroup in mbrIdentifiedPeptide.Value.SelectMany(kvp => kvp.Value).OrderByDescending(p => p.MbrScore).GroupBy(p => p.RandomRt)) { - double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); - double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); + var peakHypotheses = peakHypothesisGroup.ToList(); + ChromatographicPeak best = peakHypotheses.First(); + peakHypotheses.Remove(best); - List peaksToRemoveFromHypotheses = new List(); - foreach (ChromatographicPeak peak in peakHypotheses.Where(p => 
p.Apex.ChargeState != best.Apex.ChargeState)) + // Discard any peaks that are already associated with an ms/ms identified peptide + while (best?.Apex?.IndexedPeak != null && msmsImsPeaks.TryGetValue(best.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList)) { - if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) + if (peakList.Contains(best.Apex.IndexedPeak)) + { + if (!peakHypotheses.Any()) + { + best = null; + break; + } + best = peakHypotheses.First(); + peakHypotheses.Remove(best); + } + else { - best.MergeFeatureWith(peak, Integrate); + break; + } + } + if (best == null) continue; + + // merge peaks with different charge states + if (peakHypotheses.Count > 0) + { + double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); + double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); - peaksToRemoveFromHypotheses.Add(peak); + _results.Peaks[idAcceptorFile].Add(best); + foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) + { + if (peak.Apex.IndexedPeak.RetentionTime >= start + && peak.Apex.IndexedPeak.RetentionTime <= end) + //&& Math.Abs(peak.MbrScore - best.MbrScore) / best.MbrScore < 0.25)// 25% difference is a rough heuristic, but I don't want super shitty peaks being able to supercede the intensity of a good peak! + { + if (msmsImsPeaks.TryGetValue(peak.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList) && peakList.Contains(peak.Apex.IndexedPeak)) + { + continue; // If the peak is already accounted for, skip it. + } + else + { + best.MergeFeatureWith(peak, Integrate); + } + } } } + _results.Peaks[idAcceptorFile].Add(best); } - - _results.Peaks[idAcceptorFile].Add(best); } RunErrorChecking(idAcceptorFile); } + /// + /// A concurrent dictionary is used to keep track of MBR peaks that have been identified in the acceptor file. This function updates that dictionary + /// + /// concurrent dictionary. 
Key = Peptide sequence. Value = ConcurrentDictionary mapping where keys are isotopic envelopes and values are list of associated peaks + /// Peak to add to the dictionary + /// The donor ID associated with the MBR peaks + private void AddPeakToConcurrentDict(ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks, + ChromatographicPeak peakToSave, + Identification donorIdentification) + { + if(peakToSave == null) + { + return; + } + // save the peak hypothesis + matchBetweenRunsIdentifiedPeaks.AddOrUpdate + ( + // new key + key: donorIdentification.ModifiedSequence, + // if we are adding a value for the first time, we simply create a new dictionatry with one entry + addValueFactory: (sequenceKey) => + new ConcurrentDictionary>( + new Dictionary> + { + { peakToSave.Apex, new List { peakToSave } } + }), + // if the key (sequence) already exists, we have to add the new peak to the existing dictionary + updateValueFactory: (sequenceKey, envelopePeakListDict) => + { + envelopePeakListDict.AddOrUpdate( + key: peakToSave.Apex, + addValueFactory: (envelopeKey) => new List { peakToSave }, // if the key (envelope) doesnt exist, just create a new list + updateValueFactory: (envelopeKey, peakList) => { peakList.Add(peakToSave); return peakList; }); // if the key (envelope) already exists, add the peak to the associated list + return envelopePeakListDict; + } + ); + } + /// /// Finds MBR acceptor peaks by looping through every possible peak for every possible charge state /// in a given retention time range. Identified peaks are added to the matchBetweenRunsIdentifiedPeaks dictionary. @@ -832,20 +1194,24 @@ internal void FindAllAcceptorPeaks( RtInfo rtInfo, Tolerance fileSpecificTol, ChromatographicPeak donorPeak, - Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific) + out ChromatographicPeak bestAcceptor, + double? 
randomRt = null) { // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - (rtInfo.Width / 2.0); + double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + (rtInfo.Width / 2.0); + for (int j = 0; j < ms1ScanInfos.Length; j++) { Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= rtInfo.RtStartHypothesis) + if (scan.RetentionTime <= rtStartHypothesis) { start = scan; } - if (scan.RetentionTime >= rtInfo.RtEndHypothesis) + if (scan.RetentionTime >= rtEndHypothesis) { end = scan; break; @@ -860,11 +1226,8 @@ internal void FindAllAcceptorPeaks( chargesToMatch.Add(donorPeak.Apex.ChargeState); } - Identification donorIdentification = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - Normal rtScoringDistribution = new Normal(rtInfo.PredictedRt, rtInfo.Width / 6); - - // Grab the retention time of a random peptide in the donor file - // If it is not outside of the rtInfo.predictedRT +/- rtInfo.range (twice the width of actual window), redraw + Identification donorIdentification = donorPeak.Identifications.First(); + bestAcceptor = null; foreach (int z in chargesToMatch) { @@ -879,44 +1242,20 @@ internal void FindAllAcceptorPeaks( if (!chargeXic.Any()) continue; - List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z); + List chargeEnvelopes = GetIsotopicEnvelopes(chargeXic, donorIdentification, z).OrderBy(env => env.Intensity).ToList(); // treat each isotopic envelope in the valid region as a potential seed for a chromatographic peak. 
// remove the clustered isotopic envelopes from the list of seeds after each iteration while (chargeEnvelopes.Any()) { - ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, fileSpecificTol, rtInfo, rtScoringDistribution, z, chargeEnvelopes); + ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, + fileSpecificTol, rtInfo, z, chargeEnvelopes, randomRt); if (acceptorPeak == null) continue; - - // save the peak hypothesis - // if this peak hypothesis already exists, sum the scores since we've mapped >1 of the same ID onto this peak - if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) - { - if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.MbrScore += acceptorPeak.MbrScore; - samePeakSameSequence.Identifications.Add(donorIdentification); - } - else - { - existing.Add(acceptorPeak); - } - } - else - { - mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); - } - } - else + if (bestAcceptor == null || bestAcceptor.MbrScore < acceptorPeak.MbrScore) { - matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); - matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); + acceptorPeak.ChargeList = chargesToMatch; + bestAcceptor = acceptorPeak; } } } @@ -938,26 +1277,27 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( MbrScorer scorer, ChromatographicPeak donorPeak, Tolerance mbrTol, - RtInfo rtInfo, - Normal rtScoringDistribution, + RtInfo rtInfo, int z, - List chargeEnvelopes) + List chargeEnvelopes, + double? 
randomRt = null) { - var donorId = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile); - IsotopicEnvelope seedEnv = chargeEnvelopes.First(); + var donorId = donorPeak.Identifications.OrderBy(p => p.QValue).First(); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt != null); + // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list + IsotopicEnvelope seedEnv = chargeEnvelopes.First(); var xic = Peakfind(seedEnv.IndexedPeak.RetentionTime, donorId.PeakfindingMass, z, idAcceptorFile, mbrTol); List bestChargeEnvelopes = GetIsotopicEnvelopes(xic, donorId, z); acceptorPeak.IsotopicEnvelopes.AddRange(bestChargeEnvelopes); acceptorPeak.CalculateIntensityForThisFeature(Integrate); - acceptorPeak.SetRtWindow(rtInfo.PredictedRt, rtInfo.RtSd, rtInfo.RtInterquartileRange); CutPeak(acceptorPeak, seedEnv.IndexedPeak.RetentionTime); - var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)); - claimedPeaks.Add(seedEnv.IndexedPeak); // prevents infinite loops - + var claimedPeaks = new HashSet(acceptorPeak.IsotopicEnvelopes.Select(p => p.IndexedPeak)) + { + seedEnv.IndexedPeak // prevents infinite loops + }; chargeEnvelopes.RemoveAll(p => claimedPeaks.Contains(p.IndexedPeak)); // peak has already been identified by MSMS - skip it @@ -966,74 +1306,11 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.MbrScore = scorer.ScoreMbr(rtScoringDistribution, - retentionTime: acceptorPeak.Apex.IndexedPeak.RetentionTime, - ppmError: acceptorPeak.MassError, - acceptorIntensity: acceptorPeak.Intensity, - donorPeak); + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, randomRt ?? 
rtInfo.PredictedRt); return acceptorPeak; } - /// - /// Used by the match-between-runs algorithm to determine systematic retention time drifts between - /// chromatographic runs. - /// - private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor) - { - var donorFileBestMsmsPeaks = new Dictionary(); - var acceptorFileBestMsmsPeaks = new Dictionary(); - var rtCalibrationCurve = new List(); - - // get all peaks, not counting ambiguous peaks - IEnumerable donorPeaks = _results.Peaks[donor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - IEnumerable acceptorPeaks = _results.Peaks[acceptor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - - // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) - { - if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > acceptorPeak.Intensity) - { - acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; - } - } - else - { - acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); - } - } - - // get the best (most intense) peak for each peptide in the donor file - foreach (ChromatographicPeak donorPeak in donorPeaks) - { - if (donorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > donorPeak.Intensity) - { - donorFileBestMsmsPeaks[donorPeak.Identifications.First().ModifiedSequence] = donorPeak; - } - } - else - { - donorFileBestMsmsPeaks.Add(donorPeak.Identifications.First().ModifiedSequence, donorPeak); - } - } - - // create RT calibration curve - foreach (var peak in acceptorFileBestMsmsPeaks) - { - ChromatographicPeak 
acceptorFilePeak = peak.Value; - - if (donorFileBestMsmsPeaks.TryGetValue(peak.Key, out ChromatographicPeak donorFilePeak)) - { - rtCalibrationCurve.Add(new RetentionTimeCalibDataPoint(donorFilePeak, acceptorFilePeak)); - } - } - - return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); - } /// /// Checks for and resolves situations where one IndexedMassSpectralPeak is defined as the apex @@ -1053,6 +1330,7 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) // merge duplicate peaks and handle MBR/MSMS peakfinding conflicts var errorCheckedPeaksGroupedByApex = new Dictionary(); var errorCheckedPeaks = new List(); + foreach (ChromatographicPeak tryPeak in _results.Peaks[spectraFile].OrderBy(p => p.IsMbrPeak)) { tryPeak.CalculateIntensityForThisFeature(Integrate); @@ -1070,18 +1348,13 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } IndexedMassSpectralPeak apexImsPeak = tryPeak.Apex.IndexedPeak; - if (errorCheckedPeaksGroupedByApex.TryGetValue(apexImsPeak, out ChromatographicPeak storedPeak)) + if (errorCheckedPeaksGroupedByApex.TryGetValue(apexImsPeak, out ChromatographicPeak storedPeak) && storedPeak != null) { - if (tryPeak.IsMbrPeak && storedPeak == null) - { - continue; - } - if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - if (PeptidesModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence)) + if (PeptideModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence)) { - if (PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + if (PeptideModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) { storedPeak.MergeFeatureWith(tryPeak, Integrate); } @@ -1099,14 +1372,20 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - 
if(PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + // Default to MSMS peaks over MBR Peaks. + // Most of these have already been eliminated + // However, sometimes merging MBR peaks with different charge states reveals that + // The MBR peak conflicts with an MSMS peak + // Removing the peak when this happens is a conservative step. + // Sometimes the MSMS peak is a decoy, or has a peptides level Q-value < 0.01 (i.e., the modified sequence isn't in PeptideModifiedSequencesToQuantify). + // In this case, we keep the MBR peak. + if (storedPeak.DecoyPeptide || !PeptideModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) { - continue; + errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; } else { - // If the stored peak id isn't in the list of peptides to quantify, overwrite it - errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; + continue; } } else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak) @@ -1128,10 +1407,144 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } errorCheckedPeaks.AddRange(errorCheckedPeaksGroupedByApex.Values.Where(p => p != null)); - + _results.Peaks[spectraFile] = errorCheckedPeaks; } + private bool RunPEPAnalysis() + { + List mbrPeaks = _results.Peaks.SelectMany(kvp => kvp.Value) + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .ToList(); + + if (!mbrPeaks.IsNotNullOrEmpty()) return false; + int decoyPeakTotal = mbrPeaks.Count(peak => peak.RandomRt); + + List tempPepQs = new(); + List tempQs = new(); + if (mbrPeaks.Count > 100 && decoyPeakTotal > 20) + { + PepAnalysisEngine pepAnalysisEngine = new PepAnalysisEngine(mbrPeaks, + outputFolder: Path.GetDirectoryName(_spectraFileInfo.First().FullFilePathWithExtension), + maxThreads: MaxThreads, + pepTrainingFraction: PepTrainingFraction); + var pepOutput = pepAnalysisEngine.ComputePEPValuesForAllPeaks(); + + _results.PepResultString = 
pepOutput; + + return true; + } + return false; + } + + /// + /// Calculates the FDR for each MBR-detected peak using decoy peaks and decoy peptides, + /// Then filters out all peaks below a given FDR threshold + /// + private void CalculateFdrForMbrPeaks(SpectraFileInfo acceptorFile, bool usePep) + { + List mbrPeaks; + if (usePep) + { + // Take only the top scoring acceptor for each donor (acceptor can be target or decoy!) + // Maybe we're sorting twice when we don't have to but idk if order is preserved using group by + mbrPeaks = _results.Peaks[acceptorFile] + .Where(peak => peak.IsMbrPeak) + .GroupBy(peak => peak.Identifications.First()) + .Select(group => group.OrderBy(peak => peak.MbrPep).ThenByDescending(peak => peak.MbrScore).First()) + .OrderBy(peak => peak.MbrPep) + .ThenByDescending(peak => peak.MbrScore) + .ToList(); + + _results.Peaks[acceptorFile] = mbrPeaks.Concat(_results.Peaks[acceptorFile].Where(peak => !peak.IsMbrPeak)).ToList(); + } + else + { + // If PEP wasn't performed, things probably aren't calibrated very well, and so it's better + // To err on the safe side and not remove the decoys + mbrPeaks = _results.Peaks[acceptorFile] + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .ToList(); + } + + if (!mbrPeaks.IsNotNullOrEmpty()) return; + + List tempQs = new(); + int totalPeaks = 0; + int decoyPeptides = 0; + int decoyPeaks = 0; + int doubleDecoys = 0; + for (int i = 0; i < mbrPeaks.Count; i++) + { + totalPeaks++; + switch (mbrPeaks[i]) + { + case ChromatographicPeak p when (!p.DecoyPeptide && !p.RandomRt): + break; + case ChromatographicPeak p when (p.DecoyPeptide && !p.RandomRt): + decoyPeptides++; + break; + case ChromatographicPeak p when (!p.DecoyPeptide && p.RandomRt): + decoyPeaks++; + break; + case ChromatographicPeak p when (p.DecoyPeptide && p.RandomRt): + doubleDecoys++; + break; + } + + // There are two parts to this score. We're summing the PEPs of peaks derived from target peptides. 
For peaks derived from decoy peptides, + // We do the double decoy things where we count decoyPeptidePeaks - doubleDecoypeaks + tempQs.Add(Math.Round(EstimateFdr(doubleDecoys, decoyPeptides, decoyPeaks, totalPeaks), 6)); + } + + // Set the q-value for each peak + double[] correctedQs = CorrectQValues(tempQs); + for (int i = 0; i < correctedQs.Length; i++) + { + mbrPeaks[i].MbrQValue = correctedQs[i]; + } + } + + private int EstimateDecoyPeptideErrors(int decoyPeptideCount, int doubleDecoyCount) + { + return Math.Max(0, decoyPeptideCount - doubleDecoyCount); + } + + private double EstimateFdr(int doubleDecoyCount, int decoyPeptideCount, int decoyPeakCount, int totalPeakCount) + { + return (double)(1 + decoyPeakCount + EstimateDecoyPeptideErrors(decoyPeptideCount, doubleDecoyCount)) / totalPeakCount; + } + + /// + /// Standard q-value correction, ensures that in a list of temporary q-values, a q-value is equal to + /// Min(q-values, every q-value below in the list). As you work your way down a list of q-values, the value should only increase or stay the same. + /// + /// + /// + private double[] CorrectQValues(List tempQs) + { + if (!tempQs.IsNotNullOrEmpty()) return null; + double[] correctedQValues = new double[tempQs.Count]; + correctedQValues[tempQs.Count - 1] = tempQs.Last(); + for(int i = tempQs.Count-2; i >=0; i--) + { + if (tempQs[i] > correctedQValues[i+1]) + { + correctedQValues[i] = correctedQValues[i + 1]; + } + else + { + correctedQValues[i] = tempQs[i]; + } + } + + return correctedQValues; + } + + #endregion + /// /// Takes in a list of imsPeaks and finds all the isotopic peaks in each scan. If the experimental isotopic distribution /// matches the theoretical distribution, an IsotopicEnvelope object is created from the summed intensities of each isotopic peak. 
@@ -1231,7 +1644,7 @@ public List GetIsotopicEnvelopes( } // Check that the experimental envelope matches the theoretical - if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance)) + if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance, out var pearsonCorr)) { // impute unobserved isotope peak intensities // TODO: Figure out why value imputation is performed. Build a toggle? @@ -1243,7 +1656,7 @@ public List GetIsotopicEnvelopes( } } - isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum())); + isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum(), pearsonCorr)); } } @@ -1264,9 +1677,10 @@ public bool CheckIsotopicEnvelopeCorrelation( Dictionary> massShiftToIsotopePeaks, IndexedMassSpectralPeak peak, int chargeState, - Tolerance isotopeTolerance) + Tolerance isotopeTolerance, + out double pearsonCorrelation) { - double pearsonCorrelation = Correlation.Pearson( + pearsonCorrelation = Correlation.Pearson( massShiftToIsotopePeaks[0].Select(p => p.expIntensity), massShiftToIsotopePeaks[0].Select(p => p.theorIntensity)); diff --git a/mzLib/FlashLFQ/Identification.cs b/mzLib/FlashLFQ/Identification.cs index 59f43a1fc..2ee9bd1b4 100644 --- a/mzLib/FlashLFQ/Identification.cs +++ b/mzLib/FlashLFQ/Identification.cs @@ -15,12 +15,15 @@ public class Identification public readonly ChemicalFormula OptionalChemicalFormula; public readonly bool UseForProteinQuant; public double PeakfindingMass; - public double PosteriorErrorProbability; + public double PsmScore { get; init; } + public double QValue { get; init; } + public bool IsDecoy { get; } public Identification(SpectraFileInfo fileInfo, string BaseSequence, string ModifiedSequence, double monoisotopicMass, double ms2RetentionTimeInMinutes, int chargeState, List proteinGroups, - ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = 
true, double posteriorErrorProbability = 0) + ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true, + double psmScore = 0, double qValue = 0, bool decoy = false) { this.FileInfo = fileInfo; this.BaseSequence = BaseSequence; @@ -30,8 +33,10 @@ public Identification(SpectraFileInfo fileInfo, string BaseSequence, string Modi this.PrecursorChargeState = chargeState; this.ProteinGroups = new HashSet(proteinGroups); this.OptionalChemicalFormula = optionalChemicalFormula; - UseForProteinQuant = useForProteinQuant; - PosteriorErrorProbability = posteriorErrorProbability; + UseForProteinQuant = !decoy && useForProteinQuant; // ensure that decoy peptides aren't used for protein quant + QValue = qValue; + PsmScore = psmScore; + IsDecoy = decoy; } public override string ToString() diff --git a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs index c9aa89042..cdadc56eb 100644 --- a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs +++ b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs @@ -29,7 +29,7 @@ public override bool Equals(object obj) public override int GetHashCode() { - return Mz.GetHashCode(); + return HashCode.Combine(Mz, ZeroBasedMs1ScanIndex); } public override string ToString() diff --git a/mzLib/FlashLFQ/IsotopicEnvelope.cs b/mzLib/FlashLFQ/IsotopicEnvelope.cs index 09d7207d7..938ac7850 100644 --- a/mzLib/FlashLFQ/IsotopicEnvelope.cs +++ b/mzLib/FlashLFQ/IsotopicEnvelope.cs @@ -11,11 +11,12 @@ public class IsotopicEnvelope public readonly IndexedMassSpectralPeak IndexedPeak; public readonly int ChargeState; - public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity) + public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity, double pearsonCorrelation) { IndexedPeak = monoisotopicPeak; ChargeState = chargeState; Intensity = intensity / chargeState; + PearsonCorrelation = pearsonCorrelation; } /// @@ -25,6 +26,9 @@ public 
IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeStat /// public double Intensity { get; private set; } + + public double PearsonCorrelation { get; init; } + public void Normalize(double normalizationFactor) { Intensity *= normalizationFactor; diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index 924611e64..72c2ee72d 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -1,7 +1,10 @@ -using MathNet.Numerics.Distributions; +using Easy.Common.EasyComparer; +using MathNet.Numerics.Distributions; using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; +using System.Data; +using System.Data.Entity.ModelConfiguration.Conventions; using System.Linq; namespace FlashLFQ @@ -12,15 +15,18 @@ namespace FlashLFQ /// internal class MbrScorer { - // Intensity and ppm distribution are specific to each acceptor file + // Intensity and ppm distributions are specific to each acceptor file private readonly Normal _logIntensityDistribution; private readonly Normal _ppmDistribution; - // The logFcDistributions are unique to each donor file - acceptor file pair + private readonly Normal _scanCountDistribution; + private readonly Gamma _isotopicCorrelationDistribution; + // The logFcDistributions and rtDifference distributions are unique to each donor file - acceptor file pair private Dictionary _logFcDistributionDictionary; + private Dictionary _rtPredictionErrorDistributionDictionary; internal Dictionary ApexToAcceptorFilePeakDict { get; } internal List UnambiguousMsMsAcceptorPeaks { get; } - + internal double MaxNumberOfScansObserved { get; } /// /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution @@ -28,49 +34,151 @@ internal class MbrScorer /// internal MbrScorer( Dictionary apexToAcceptorFilePeakDict, - List acceptorPeaks, + List acceptorFileMsmsPeaks, Normal ppmDistribution, Normal logIntensityDistribution) { ApexToAcceptorFilePeakDict = 
apexToAcceptorFilePeakDict; - UnambiguousMsMsAcceptorPeaks = acceptorPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); + UnambiguousMsMsAcceptorPeaks = acceptorFileMsmsPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); + MaxNumberOfScansObserved = acceptorFileMsmsPeaks.Max(peak => peak.ScanCount); _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; + _isotopicCorrelationDistribution = GetIsotopicEnvelopeCorrDistribution(); _logFcDistributionDictionary = new(); + _rtPredictionErrorDistributionDictionary = new(); + + // This is kludgey, because scan counts are discrete + List scanList = UnambiguousMsMsAcceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); + // build a normal distribution for the scan list of the acceptor peaks + _scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36); + } + + /// + /// This distribution represents (1 - Pearson Correlation) for isotopic envelopes of MS/MS acceptor peaks + /// + /// + private Gamma GetIsotopicEnvelopeCorrDistribution() + { + var pearsonCorrs = UnambiguousMsMsAcceptorPeaks.Select(p => 1 - p.IsotopicPearsonCorrelation).Where(p => p > 0).ToList(); + if (pearsonCorrs.Count <= 1) return null; + double mean = pearsonCorrs.Mean(); + double variance = pearsonCorrs.Variance(); + var alpha = Math.Pow(mean, 2) / variance; + var beta = mean / variance; + return new Gamma(alpha, beta); + } + + /// + /// Takes in a list of retention time differences for anchor peptides (donor RT - acceptor RT) and uses + /// this list to calculate the distribution of prediction errors of the local RT alignment strategy employed by + /// match-between-runs for the specified donor file + /// + /// List of retention time differences (doubles) calculated as donor file RT - acceptor file RT + internal void 
AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs, int numberOfAnchorPeptides) + { + // in MBR, we use anchor peptides on either side of the donor to predict the retention time + // here, we're going to repeat the same process, using neighboring anchor peptides to predict the Rt shift for each + // individual anchor peptide + // then, we'll check how close our predicted rt shift was to the observed rt shift + // and build a distribution based on the predicted v actual rt diffs + + double cumSumRtDiffs; + List rtPredictionErrors = new(); + + for (int i = numberOfAnchorPeptides; i < (anchorPeptideRtDiffs.Count - numberOfAnchorPeptides); i++) + { + cumSumRtDiffs = 0; + for(int j = 1; j <= numberOfAnchorPeptides; j++) + { + cumSumRtDiffs += anchorPeptideRtDiffs[i - j]; + cumSumRtDiffs += anchorPeptideRtDiffs[i + j]; + } + double avgDiff = cumSumRtDiffs / (2 * numberOfAnchorPeptides); + rtPredictionErrors.Add(avgDiff - anchorPeptideRtDiffs[i]); + } + + Normal rtPredictionErrorDist = new Normal(0, 0); + // Default distribution. Effectively assigns a RT Score of zero if no alignment can be performed + // between the donor and acceptor based on shared MS/MS IDs + + if(rtPredictionErrors.Any()) + { + double medianRtError = rtPredictionErrors.Median(); + double stdDevRtError = rtPredictionErrors.StandardDeviation(); + if(stdDevRtError >= 0.0 && !double.IsNaN(medianRtError)) + { + rtPredictionErrorDist = new Normal(medianRtError, 1); + } + } + + _rtPredictionErrorDistributionDictionary.Add(donorFile, rtPredictionErrorDist); } /// - /// Scores a MBR peak based on it's retention time, ppm error, and intensity + /// Scores an MBR acceptor peak against its donor peak. Calculates the intensity, retention time, ppm error, + /// scan count, and isotopic distribution scores, stores each of them on the acceptor peak, and combines them + /// into a single MBR score /// - /// The MBR score as a double. Higher scores are better.
- internal double ScoreMbr(Normal rtDistribution, double retentionTime, double ppmError, double acceptorIntensity, ChromatographicPeak? donorPeak = null) + /// An MBR Score ranging between 0 and 100. Higher scores are better. + internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak, double predictedRt) + { + acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); + acceptorPeak.RtPredictionError = predictedRt - acceptorPeak.ApexRetentionTime; + acceptorPeak.RtScore = CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], + acceptorPeak.RtPredictionError); + acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); + acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); + acceptorPeak.IsotopicDistributionScore = CalculateScore(_isotopicCorrelationDistribution, 1 - acceptorPeak.IsotopicPearsonCorrelation); + + // Returns 100 times the geometric mean of the five scores (intensity score, rt score, ppm score, scan count score, isotopic distribution score) + return 100 * Math.Pow(acceptorPeak.IntensityScore + * acceptorPeak.RtScore + * acceptorPeak.PpmScore + * acceptorPeak.ScanCountScore + * acceptorPeak.IsotopicDistributionScore, 0.20); + } + + // Setting a minimum score prevents the MBR score from going to zero if one component of that score is 0 + // 3e-7 is the fraction of a normal distribution that lies at least 5 stdDev away from the mean + private double _minScore = 3e-7; + + internal double CalculateScore(Normal distribution, double value) + { + // new method + double absoluteDiffFromMean = Math.Abs(distribution.Mean - value); + // Returns a value between (0, 1] where 1 means the value was equal to the distribution mean + // The score represents the fraction of the distribution that lies absoluteDiffFromMean away from the mean or further + // i.e., what fraction of the distribution is more extreme than value + double score = 2 *
distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); + return (double.IsNaN(score) || score == 0) ? _minScore : score; + } + + internal double CalculateScore(Gamma distribution, double value) + { + if (value < 0 || distribution == null) + { + return _minScore; + } + + // For the gamma distribution, the CDF is 0 when the pearson correlation is equal to 1 (value = 0) + // The CDF then rapidly rises, reaching ~1 at a value of 0.3 (corresponding to a pearson correlation of 0.7) + return 1 - distribution.CumulativeDistribution(value); + } + + internal double CalculateIntensityScore(double acceptorIntensity, ChromatographicPeak donorPeak) { - double intensityDensity; if (donorPeak != null && acceptorIntensity != 0 && donorPeak.Intensity != 0 && _logFcDistributionDictionary.TryGetValue(donorPeak.SpectraFileInfo, out var logFcDistribution)) { - intensityDensity = logFcDistribution.Density( - Math.Log(acceptorIntensity, 2) - Math.Log(donorPeak.Intensity, 2) - ); + var logFoldChange = Math.Log(acceptorIntensity, 2) - Math.Log(donorPeak.Intensity, 2); + return CalculateScore(logFcDistribution, logFoldChange); } else { var logIntensity = Math.Log(acceptorIntensity, 2); - // I don't know what the if/else statement accomplishes.
It feels like we should take the density regardless - // As it is, the score is artifically inflated for very intense peaks - if (logIntensity < _logIntensityDistribution.Median) - intensityDensity = _logIntensityDistribution.Density(logIntensity); - else - intensityDensity = _logIntensityDistribution.Density(_logIntensityDistribution.Mode); + return CalculateScore(_logIntensityDistribution, logIntensity); } - double intensityScore = DensityScoreConversion(intensityDensity); - double ppmScore = DensityScoreConversion(_ppmDistribution.Density(ppmError)); - double rtScore = DensityScoreConversion(rtDistribution.Density(retentionTime)); - - double donorIdPEP = donorPeak.Identifications.OrderBy(p => p.PosteriorErrorProbability).First().PosteriorErrorProbability; - - return (ppmScore + rtScore + intensityScore) * (1 - donorIdPEP); } /// @@ -126,17 +234,19 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP _logFcDistributionDictionary.Add(idDonorPeaks.First().SpectraFileInfo, foldChangeDistribution); } } + /// - /// Takes in the density of a normal distribution at a given point, and transforms it - /// by taking the log of the density plus the square root of the squared density plus one - /// This transformation was implemented in the original code, and we're unsure of the rationale + /// Determines whether or not the scorer is validly parameterized and capable + /// of scoring MBR transfers originating from the given donorFile /// - /// A Normal distribution - /// The transformed score - private double DensityScoreConversion(double density) + internal bool IsValid(SpectraFileInfo donorFile) { - return Math.Log(density + Math.Sqrt(Math.Pow(density, 2) + 1)); + return _rtPredictionErrorDistributionDictionary.TryGetValue(donorFile, out var rtDist) + && rtDist != null + && _ppmDistribution != null + && _scanCountDistribution != null + && _logIntensityDistribution != null; } - + } } diff --git a/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs
b/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs new file mode 100644 index 000000000..fbb8f429d --- /dev/null +++ b/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs @@ -0,0 +1,106 @@ +using Easy.Common.Extensions; +using Microsoft.ML.Data; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Text; + +namespace FlashLFQ.PEP +{ + public class ChromatographicPeakData + { + public static readonly IImmutableDictionary trainingInfos = new Dictionary + { + { "standard", new [] + { + "PpmErrorScore", + "IntensityScore", + "RtScore", + "ScanCountScore", + "IsotopicDistributionScore", + "PpmErrorRaw", + "IntensityRaw", + "RtPredictionErrorRaw", + "ScanCountRaw", + "IsotopicPearsonCorrelation" + } + }, + { "reduced", new [] + { + "PpmErrorRaw", + "IntensityRaw", + "RtPredictionErrorRaw", + "ScanCountRaw", + "IsotopicPearsonCorrelation" + } + }, + }.ToImmutableDictionary(); + + /// + /// These are used for percolator. Trainer must be told the assumed direction for each attribute as it relates to being a true positive + /// Here, a weight of 1 indicates that the probability of being true is for higher numbers in the set. + /// A weight of -1 indicates that the probability of being true is for the lower numbers in the set. 
+ /// + public static readonly IImmutableDictionary assumedAttributeDirection = new Dictionary { + { "PpmErrorScore", 1 }, + { "IntensityScore", 1 }, + { "RtScore", 1 }, + { "ScanCountScore", 1 }, + { "IsotopicDistributionScore", 1 }, + { "PpmErrorRaw", -1 }, + { "IntensityRaw", 1 }, + { "RtPredictionErrorRaw", -1 }, + { "ScanCountRaw", -1 }, + { "IsotopicPearsonCorrelation", 1 } + }.ToImmutableDictionary(); + + public string ToString(string searchType) + { + StringBuilder sb = new StringBuilder(); + var variablesToOutput = ChromatographicPeakData.trainingInfos[searchType]; + + foreach (var variable in variablesToOutput) + { + var property = typeof(ChromatographicPeakData).GetProperty(variable).GetValue(this, null); + var floatValue = (float)property; + sb.Append("\t"); + sb.Append(floatValue.ToString()); + } + + return sb.ToString(); + } + + [LoadColumn(0)] + public float PpmErrorScore { get; set; } + + [LoadColumn(1)] + public float IntensityScore { get; set; } + + [LoadColumn(2)] + public float RtScore { get; set; } + + [LoadColumn(3)] + public float ScanCountScore { get; set; } + + [LoadColumn(4)] + public float IsotopicDistributionScore { get; set; } + + [LoadColumn(5)] + public float PpmErrorRaw { get; set; } + + [LoadColumn(6)] + public float IntensityRaw { get; set; } + + [LoadColumn(7)] + public float RtPredictionErrorRaw { get; set; } + + [LoadColumn(8)] + public float ScanCountRaw { get; set; } + + [LoadColumn(9)] + public float IsotopicPearsonCorrelation { get; set; } + + [LoadColumn(10)] + public bool Label { get; set; } + + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/PEP/DonorGroup.cs b/mzLib/FlashLFQ/PEP/DonorGroup.cs new file mode 100644 index 000000000..351bdee90 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/DonorGroup.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ.PEP +{ + /// + /// This class 
represents a group of chromatographic peaks that are associated with a donor identification. + /// During MBR, one donor identification is associated with multiple acceptor identifications, with both + /// predicted retention times (good MBR transfers) and random retention times (decoy MBR transfers). + /// This class groups them together for the purpose of cross-validation/PEP scoring + /// + public class DonorGroup : IEnumerable + { + public Identification DonorId { get; } + public List TargetAcceptors { get; } + public List DecoyAcceptors { get; } + + public DonorGroup(Identification donorId, List targetAcceptors, List decoyAcceptors) + { + DonorId = donorId; + TargetAcceptors = targetAcceptors; + DecoyAcceptors = decoyAcceptors; + } + + public double BestTargetMbrScore => TargetAcceptors.Count == 0 ? 0 : TargetAcceptors.Max(acceptor => acceptor.MbrScore); + + public IEnumerator GetEnumerator() + { + return TargetAcceptors.Concat(DecoyAcceptors).GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + } +} diff --git a/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs b/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs new file mode 100644 index 000000000..eaddae3e8 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs @@ -0,0 +1,647 @@ +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Omics; +using System.Collections; +using System.Security.Policy; +using System.Text.RegularExpressions; +using System.Reflection; + +namespace FlashLFQ.PEP +{ + public class PepAnalysisEngine + { + public double PipScoreCutoff; + + private static int _randomSeed = 42; + + /// + /// This method contains the hyper-parameters that will be used when training the machine learning model + /// + /// Options object to be passed in to the 
FastTree constructor + public FastTreeBinaryTrainer.Options BGDTreeOptions => + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 100, + MinimumExampleCountPerLeaf = 10, + NumberOfLeaves = 20, + LearningRate = 0.2, + LabelColumnName = "Label", + FeatureColumnName = "Features", + Seed = _randomSeed, + FeatureSelectionSeed = _randomSeed, + RandomStart = false, + UnbalancedSets = true + }; + + public List Peaks { get; } + public string OutputFolder { get; set; } + public int MaxThreads { get; set; } + public double PepTrainingFraction { get; set; } + + public PepAnalysisEngine(List peaks, string outputFolder, int maxThreads, double pepTrainingFraction = 0.25) + { + Peaks = peaks; + OutputFolder = outputFolder; + MaxThreads = maxThreads; + PepTrainingFraction = pepTrainingFraction; + } + + public string ComputePEPValuesForAllPeaks() + { + string[] trainingVariables = ChromatographicPeakData.trainingInfos["standard"]; + + #region Construct Donor Groups + // this is target peak not target peptide + List donors= new(); + foreach(var donorGroup in Peaks + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .GroupBy(peak => peak.Identifications.First())) //Group by donor peptide + { + var donorId = donorGroup.Key; + var targetAcceptors = donorGroup.Where(peak => !peak.RandomRt).ToList(); + var decoyAcceptors = donorGroup.Where(peak => peak.RandomRt).ToList(); + donors.Add(new DonorGroup(donorId, targetAcceptors, decoyAcceptors)); + } + + // Fix the order + donors = OrderDonorGroups(donors); + + var peakScores = donors.SelectMany(donor => donor.Select(p => p.MbrScore)).OrderByDescending(score => score).ToList(); + PipScoreCutoff = peakScores[(int)Math.Floor(peakScores.Count * PepTrainingFraction)]; //Select the top N percent of all peaks, only use those as positive examples + + MLContext mlContext = new MLContext(_randomSeed); + //the number of groups used for cross-validation is hard-coded at three. 
Do not change this number without changing other areas of affected code. + const int numGroups = 3; + + List[] donorGroupIndices = GetDonorGroupIndices(donors, numGroups, PipScoreCutoff); + + #endregion + + #region Create Groups and Model + IEnumerable[] ChromatographicPeakDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + ChromatographicPeakDataGroups[i] = CreateChromatographicPeakData(donors, donorGroupIndices[i], MaxThreads); + + if (!ChromatographicPeakDataGroups[i].Any(p => p.Label == true) + || !ChromatographicPeakDataGroups[i].Any(p => p.Label == false)) + { + return "Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples."; + } + } + + TransformerChain>>[] trainedModels + = new TransformerChain>>[numGroups]; + + var trainer = mlContext.BinaryClassification.Trainers.FastTree(BGDTreeOptions); + var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables) + .Append(trainer); + + List allMetrics = new List(); + + #endregion + + #region Training and Cross Validation First iteration + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) + { + + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + //concat doesn't work in a loop, therefore I had to hard code the concat to group 2 out of 3 lists. if the const int numGroups value is changed, then the concat has to be changed accordingly.
+ IDataView dataView = mlContext.Data.LoadFromEnumerable( + ChromatographicPeakDataGroups[allGroupIndexes[0]] + .Concat(ChromatographicPeakDataGroups[allGroupIndexes[1]])); + + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(ChromatographicPeakDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + Compute_PEP_For_All_Peaks(donors, donorGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], OutputFolder, MaxThreads); + + allMetrics.Add(metrics); + } + + #endregion + #region Iterative Training + + for(int trainingIteration = 0; trainingIteration < 9; trainingIteration++) + { + ChromatographicPeakDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + ChromatographicPeakDataGroups[i] = CreateChromatographicPeakDataIteration(donors, donorGroupIndices[i], MaxThreads); + + if (!ChromatographicPeakDataGroups[i].Any(p => p.Label == true) + || !ChromatographicPeakDataGroups[i].Any(p => p.Label == false)) + { + return "Posterior error probability analysis failed. 
This can occur for small data sets when some sample groups are missing positive or negative training examples."; + } + } + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) + { + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + IDataView dataView = mlContext.Data.LoadFromEnumerable( + ChromatographicPeakDataGroups[allGroupIndexes[0]] + .Concat(ChromatographicPeakDataGroups[allGroupIndexes[1]])); + + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(ChromatographicPeakDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + Compute_PEP_For_All_Peaks(donors, donorGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], OutputFolder, MaxThreads); + + allMetrics.Add(metrics); + } + } + #endregion + + return AggregateMetricsForOutput(allMetrics); + } + + public static List OrderDonorGroups(List donors) + { + return donors.OrderByDescending(donor => donor.TargetAcceptors.Count) + .ThenByDescending(donor => donor.DecoyAcceptors.Count) + .ThenByDescending(donor => donor.BestTargetMbrScore) + .ToList(); + } + + //we add the indexes of the targets and decoys to the groups separately in the hope that we'll get at least one target and one decoy in each group. + //then training can possibly be more successful. 
+ public static List[] GetDonorGroupIndices(List donors, int numGroups, double scoreCutoff) + { + List[] groupsOfIndices = new List[numGroups]; + for (int i = 0; i < numGroups; i++) + { + groupsOfIndices[i] = new List(); + } + + int myIndex = 0; + + while (myIndex < donors.Count) + { + int subIndex = 0; + while (subIndex < numGroups && myIndex < donors.Count) + { + groupsOfIndices[subIndex].Add(myIndex); + + subIndex++; + myIndex++; + } + } + + EqualizeDonorGroupIndices(donors, groupsOfIndices, scoreCutoff, numGroups); + + return groupsOfIndices; + } + + /// + /// Equalizes partitions used for cross-validation. The goal is to have the same number of targets and decoys in each partition + /// + /// List of all DonorGroups to be classified + /// An array of lists. Each list contains the indices of donor groups for a given partition + /// The MBR Score cutoff that determines which MBR target peaks will be used as positive training examples + /// /// Number of groups used for cross-validation, default = 3 + public static void EqualizeDonorGroupIndices(List donors, List[] groupsOfIndices, double scoreCutoff, int numGroups = 3) + { + HashSet swappedDonors = new HashSet(); // Keep track of everything we've swapped so we don't swap it again + // Outer loop iterates over the groups of indices (partitions) three times + // after each inner loop iteration, the number of targets and decoys in each adjacent group is equal, but commonly group 1 and 3 will have a different number + // of targets and decoys.
Looping three times should resolve this + for (int i = 0; i < numGroups*3 - 1; i++) + { + int groupA = i % numGroups; + int groupB = (i + 1) % numGroups; + int targetsA = 0; + int targetsB = 0; + int decoysA = 0; + int decoysB = 0; + foreach (int index in groupsOfIndices[groupA]) + { + targetsA += donors[index].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff); + decoysA += donors[index].DecoyAcceptors.Count; + } + foreach (int index in groupsOfIndices[groupB]) + { + targetsB += donors[index].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff); + decoysB += donors[index].DecoyAcceptors.Count; + } + + bool stuck = false; + int outerIterations = 0; + int minIndex = groupsOfIndices[groupA].Min(); + + // Calculate the difference in targets and decoys between the two groups + int targetSurplus = targetsA - targetsB; + int decoySurplus = decoysA - decoysB; + + while ((Math.Abs(targetSurplus) > 1 | Math.Abs(decoySurplus) > 1) && !stuck && outerIterations < 3) + { + bool swapped = false; + outerIterations++; + + int innerIterations = 0; + // start from the bottom of group 1, trying to swap peaks. 
+ // If group 1 has more targets than group 2, we want to swap groups to equalize the number of targets in each group + while (Math.Abs(targetSurplus) > 1 & !stuck & innerIterations < 3) + { + innerIterations++; + swapped = false; + // Traverse the list of donor indices in descending order, looking for a good candidate to swap + foreach (int donorIndexA in groupsOfIndices[groupA].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + int donorIndexATargetCount = donors[donorIndexA].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff); + switch (targetSurplus > 0) + { + case true: // i.e., too many targets + if (donorIndexATargetCount < 1) continue; // No targets to swap + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff) < donorIndexATargetCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + case false: // i.e., too few targets + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff) > donorIndexATargetCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + } + + // If we reach the index of the list of donorGroups, set stuck to true so that the outer loop will break + if (donorIndexA == minIndex) + { + stuck = true; + break; + } + if (swapped) + break; + + } + } + + innerIterations = 0; + // Now we'll do the decoys + while (Math.Abs(decoySurplus) > 1 & !stuck & innerIterations < 3) + { + innerIterations++; + swapped = 
false; + foreach (int donorIndexA in groupsOfIndices[groupA].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + int donorIndexADecoyCount = donors[donorIndexA].DecoyAcceptors.Count(); + switch (decoySurplus > 0) + { + case true: // i.e., too many decoys + if (donorIndexADecoyCount < 1) continue; // No decoys to swap + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].DecoyAcceptors.Count() < donorIndexADecoyCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + case false: // i.e., too few decoys + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].DecoyAcceptors.Count() > donorIndexADecoyCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + } + + // If we reach the index of the list of donorGroups, set stuck to true so that the outer loop will break + if (donorIndexA == minIndex) + { + stuck = true; + break; + } + if (swapped) + break; + } + } + } + } + } + + /// + /// Takes in a list of donor groups and a list of indices for each group, and swaps one donor index from each of two groups + /// Updates the targetSurplus and decoySurplus variables + /// Updates the swappedDonors hash set to keep track of which donors have been swapped + /// This is done to equalize the number of targets and decoys in each partition for cross validation + /// + public static void GroupSwap( + List donors, + List[] groupsOfIndices, + int donorIndexA, + int donorIndexB, + int groupsOfIndicesIndexA, + int groupsOfIndicesIndexB, + double scoreCutoff, + HashSet swappedDonors, +
ref int targetSurplus, + ref int decoySurplus) + { + // Multiply by two because the surplus is the difference between the two groups + // So removing one peak from one group and adding it to the other group is a difference of two + targetSurplus += 2 * ( + donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff) - + donors[donorIndexA].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff)); + decoySurplus += 2 * ( + donors[donorIndexB].DecoyAcceptors.Count - + donors[donorIndexA].DecoyAcceptors.Count); + + groupsOfIndices[groupsOfIndicesIndexA].Add(donorIndexB); + groupsOfIndices[groupsOfIndicesIndexA].Remove(donorIndexA); + groupsOfIndices[groupsOfIndicesIndexB].Add(donorIndexA); + groupsOfIndices[groupsOfIndicesIndexB].Remove(donorIndexB); + + // Record both donors as swapped; callers filter candidates on this set, so the same pair is not swapped back on a later pass + swappedDonors.Add(donorIndexA); + swappedDonors.Add(donorIndexB); + } + + /// + /// Creates chromatographic peak data that will be used to train the machine learning model + /// Classifies peaks as positive or negative training examples + /// Positive training examples are non-random-RT peaks whose MBR score falls in the top 25% of scores for the group, + /// Negative training examples are peaks with random retention times + /// + /// The list of donor groups. + /// The list of donor indices. + /// The maximum number of threads. + /// The enumerable of chromatographic peak data. 
+ public IEnumerable CreateChromatographicPeakData(List donors, List donorIndices, int maxThreads) + { + object ChromatographicPeakDataListLock = new object(); + List ChromatographicPeakDataList = new List(); + + List pipScores = new(); + foreach(int i in donorIndices) + { + pipScores.AddRange(donors[i].Select(peak => peak.MbrScore)); + } + if (pipScores.Count == 0) return new ChromatographicPeakData[0]; // No peaks to score: indexing the empty score list below would throw + pipScores.Sort((a, b) => b.CompareTo(a)); // This is a descending sort, so the cutoff is the score one quarter of the way down from the best + double groupSpecificPipScoreCutoff = pipScores[(int)Math.Floor(pipScores.Count * 0.25)]; + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localChromatographicPeakDataList = new List(); + for (int i = range.Item1; i < range.Item2; i++) + { + var donor = donors[donorIndices[i]]; + foreach (var peak in donor) + { + ChromatographicPeakData newChromatographicPeakData; + if (peak.RandomRt) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: false); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + else if (peak.MbrScore >= groupSpecificPipScoreCutoff) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: true); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + } + } + lock (ChromatographicPeakDataListLock) + { + ChromatographicPeakDataList.AddRange(localChromatographicPeakDataList); + } + }); + + ChromatographicPeakData[] pda = ChromatographicPeakDataList.ToArray(); + + return pda.AsEnumerable(); + } + + /// + /// Creates chromatographic peak data, but uses PEP values instead of MBR scores to select the positive training examples + /// + /// The list of donor groups. + /// The list of donor indices. + /// The maximum number of threads. + /// The enumerable of chromatographic peak data. 
+ public IEnumerable CreateChromatographicPeakDataIteration(List donors, List donorIndices, int maxThreads) + { + object ChromatographicPeakDataListLock = new object(); + List ChromatographicPeakDataList = new List(); + + List peps = new(); + foreach (int i in donorIndices) + { + peps.AddRange(donors[i].Select(peak => peak.MbrPep ?? 1)); + } + if (peps.Count == 0) return new ChromatographicPeakData[0]; // No peaks to score: indexing the empty PEP list below would throw + peps.Sort(); // Ascending sort, so the cutoff below bounds the lowest quarter of PEP values + double groupSpecificPepCutoff = peps[(int)Math.Floor(peps.Count * 0.25)]; + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localChromatographicPeakDataList = new List(); + for (int i = range.Item1; i < range.Item2; i++) + { + var donor = donors[donorIndices[i]]; + foreach (var peak in donor) + { + ChromatographicPeakData newChromatographicPeakData; + if (peak.RandomRt) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: false); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + else if (peak.MbrPep <= groupSpecificPepCutoff) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: true); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + } + } + lock (ChromatographicPeakDataListLock) + { + ChromatographicPeakDataList.AddRange(localChromatographicPeakDataList); + } + }); + + ChromatographicPeakData[] pda = ChromatographicPeakDataList.ToArray(); + + return pda.AsEnumerable(); + } + + public static void Compute_PEP_For_All_Peaks( + List donors, + List donorIndices, + MLContext mLContext, + TransformerChain>> trainedModel, + string outputFolder, int maxThreads) + { + object lockObject = new object(); + + //the trained model is not threadsafe. Therefore, to use the same model for each thread saved the model to disk. Then each thread reads its own copy of the model back from disk. 
+ //If there is no output folder specified, then this can't happen. We set maxThreads equal to one and use the model that gets passed into the method. + if (String.IsNullOrEmpty(outputFolder)) + { + maxThreads = 1; + } + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + + ITransformer threadSpecificTrainedModel; + if (maxThreads == 1) + { + threadSpecificTrainedModel = trainedModel; + } + else + { + threadSpecificTrainedModel = mLContext.Model.Load(Path.Combine(outputFolder, "model.zip"), out DataViewSchema savedModelSchema); + } + + // one prediction engine per thread, because the prediction engine is not thread-safe + var threadPredictionEngine = mLContext.Model.CreatePredictionEngine(threadSpecificTrainedModel); + + for (int i = range.Item1; i < range.Item2; i++) + { + DonorGroup donor = donors[donorIndices[i]]; + + foreach(ChromatographicPeak peak in donor) + { + ChromatographicPeakData pd = CreateOneChromatographicPeakDataEntry(peak, label: !peak.RandomRt); + var pepValuePrediction = threadPredictionEngine.Predict(pd); + peak.MbrPep = 1 - pepValuePrediction.Probability; + } + } + }); + } + + public static string AggregateMetricsForOutput(List allMetrics) + { + List accuracy = allMetrics.Select(m => m.Accuracy).ToList(); + List areaUnderRocCurve = allMetrics.Select(m => m.AreaUnderRocCurve).ToList(); + List areaUnderPrecisionRecallCurve = allMetrics.Select(m => m.AreaUnderPrecisionRecallCurve).ToList(); + List F1Score = allMetrics.Select(m => m.F1Score).ToList(); + List logLoss = allMetrics.Select(m => m.LogLoss).ToList(); + List logLossReduction = allMetrics.Select(m => m.LogLossReduction).ToList(); + List positivePrecision = allMetrics.Select(m => m.PositivePrecision).ToList(); + List positiveRecall = allMetrics.Select(m => m.PositiveRecall).ToList(); + List negativePrecision = allMetrics.Select(m => m.NegativePrecision).ToList(); + List negativeRecall = 
allMetrics.Select(m => m.NegativeRecall).ToList(); + + // log-loss can stochastically take on a value of infinity. + // correspondingly, log-loss reduction can be negative infinity. + // when this happens for one or more of the metrics, it can lead to uninformative numbers. + // so, unless they are all infinite, we remove them from the average. If they are all infinite, we report that. + + logLoss.RemoveAll(x => x == Double.PositiveInfinity); + logLossReduction.RemoveAll(x => x == Double.NegativeInfinity); + + double logLossAverage = Double.PositiveInfinity; + double logLossReductionAverage = Double.NegativeInfinity; + + if ((logLoss != null) && (logLoss.Any())) + { + logLossAverage = logLoss.Average(); + } + + if ((logLossReduction != null) && (logLossReduction.Any())) + { + logLossReductionAverage = logLossReduction.Average(); + } + + StringBuilder s = new StringBuilder(); + s.AppendLine(); + s.AppendLine("************************************************************"); + s.AppendLine("* Metrics for Determination of PEP Using Binary Classification "); + s.AppendLine("*-----------------------------------------------------------"); + s.AppendLine("* Accuracy: " + accuracy.Average().ToString()); + s.AppendLine("* Area Under Curve: " + areaUnderRocCurve.Average().ToString()); + s.AppendLine("* Area under Precision recall Curve: " + areaUnderPrecisionRecallCurve.Average().ToString()); + s.AppendLine("* F1Score: " + F1Score.Average().ToString()); + s.AppendLine("* LogLoss: " + logLossAverage.ToString()); + s.AppendLine("* LogLossReduction: " + logLossReductionAverage.ToString()); + s.AppendLine("* PositivePrecision: " + positivePrecision.Average().ToString()); + s.AppendLine("* PositiveRecall: " + positiveRecall.Average().ToString()); + s.AppendLine("* NegativePrecision: " + negativePrecision.Average().ToString()); + s.AppendLine("* NegativeRecall: " + negativeRecall.Average().ToString()); + s.AppendLine("************************************************************"); + 
return s.ToString(); + } + + public static ChromatographicPeakData CreateOneChromatographicPeakDataEntry(ChromatographicPeak peak,bool label) + { + + peak.PepPeakData = new ChromatographicPeakData + { + PpmErrorScore = (float)peak.PpmScore, + IntensityScore = (float)peak.IntensityScore, + RtScore = (float)peak.RtScore, + ScanCountScore = (float)peak.ScanCountScore, + IsotopicDistributionScore = (float)peak.IsotopicDistributionScore, + + PpmErrorRaw = (float)Math.Abs(peak.MassError), + IntensityRaw = (float)Math.Log2(peak.Intensity), + RtPredictionErrorRaw = (float)Math.Abs(peak.RtPredictionError), + ScanCountRaw = (float)peak.IsotopicEnvelopes.Count, + IsotopicPearsonCorrelation = (float)(peak.IsotopicPearsonCorrelation), + + Label = label, + + }; + + return peak.PepPeakData; + } + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs b/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs new file mode 100644 index 000000000..5837959d8 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs @@ -0,0 +1,18 @@ +using Microsoft.ML.Data; + +namespace FlashLFQ.PEP +{ + public class TruePositivePrediction + { + // ColumnName attribute is used to change the column name from + // its default value, which is the name of the field. + [ColumnName("PredictedLabel")] + public bool Prediction; + + // No need to specify ColumnName attribute, because the field + // name "Probability" is the column name we want. 
+ public float Probability; + + public float Score; + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs b/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs new file mode 100644 index 000000000..396e76a9a --- /dev/null +++ b/mzLib/FlashLFQ/ResultsReading/MzLibExtensions.cs @@ -0,0 +1,64 @@ +using Readers.ExternalResults.BaseClasses; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ +{ + public static class MzLibExtensions + { + /// + /// Makes a list of identification objects usable by FlashLFQ from an IQuantifiableResultFile; records with the same file name share one SpectraFileInfo and records with the same protein accessions share one ProteinGroup + /// + public static List MakeIdentifications(this IQuantifiableResultFile quantifiable) + { + IEnumerable quantifiableRecords = quantifiable.GetQuantifiableResults(); + List identifications = new List(); + Dictionary allProteinGroups = new Dictionary(); + Dictionary allFiles = new Dictionary(); + + foreach (var record in quantifiableRecords) + { + string baseSequence = record.BaseSequence; + string modifiedSequence = record.ModifiedSequence; + double ms2RetentionTimeInMinutes = record.RetentionTime; + double monoisotopicMass = record.MonoisotopicMass; + int precursorChargeState = record.ChargeState; + + SpectraFileInfo file = null; + if (allFiles.TryGetValue(record.FileName, out var fileInfo)) + { + file = fileInfo; // reuse the instance already created for this file name so all of its identifications share it + } + else + { + // placeholder values for SpectraFileInfo that will be edited later + file = new SpectraFileInfo(record.FileName, "", 1, 1, 1); + allFiles.Add(record.FileName, file); + } + + List proteinGroups = new(); + foreach (var info in record.ProteinGroupInfos) + { + if (allProteinGroups.TryGetValue(info.proteinAccessions, out var proteinGroup)) + { + proteinGroups.Add(proteinGroup); + } + else + { + allProteinGroups.Add(info.proteinAccessions, new ProteinGroup(info.proteinAccessions, info.geneName, 
info.organism)); + proteinGroups.Add(allProteinGroups[info.proteinAccessions]); + } + } + Identification id = new Identification(file, baseSequence, modifiedSequence, monoisotopicMass, ms2RetentionTimeInMinutes, precursorChargeState, proteinGroups); + identifications.Add(id); + + } + + return identifications; + } + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs index 74c4e7b91..ceda038e4 100644 --- a/mzLib/FlashLFQ/RtInfo.cs +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -9,18 +9,14 @@ namespace FlashLFQ public class RtInfo { public double PredictedRt { get; } - public double Width { get; } - public double? RtSd { get; } - public double? RtInterquartileRange { get; } + public double Width { get; set; } public double RtStartHypothesis => PredictedRt - (Width / 2.0); public double RtEndHypothesis => PredictedRt + (Width / 2.0); - public RtInfo(double predictedRt, double width, double? rtSd, double? rtInterquartileRange) + public RtInfo(double predictedRt, double width) { PredictedRt = predictedRt; Width = width; - RtSd = rtSd; - RtInterquartileRange = rtInterquartileRange; } } } diff --git a/mzLib/FlashLFQ/SpectraFileInfo.cs b/mzLib/FlashLFQ/SpectraFileInfo.cs index 728ee93b7..9cc24b6d7 100644 --- a/mzLib/FlashLFQ/SpectraFileInfo.cs +++ b/mzLib/FlashLFQ/SpectraFileInfo.cs @@ -1,13 +1,25 @@ -namespace FlashLFQ +using System.IO; + +namespace FlashLFQ { public class SpectraFileInfo { - public readonly string FullFilePathWithExtension; - public readonly string FilenameWithoutExtension; - public string Condition; - public readonly int BiologicalReplicate; - public readonly int Fraction; - public readonly int TechnicalReplicate; + /// + /// The path to the data file (e.g., a .raw file) with the extension + /// + public string FullFilePathWithExtension { get; init; } + /// + /// The name of the data file without the extension + /// + public string FilenameWithoutExtension { get; init; } + /// + /// The condition of the sample (e.g., 
"Control" or "Treatment") + /// + public string Condition { get; set; } + + public int BiologicalReplicate { get; set; } + public int TechnicalReplicate { get; set; } + public int Fraction { get; set; } public SpectraFileInfo(string fullFilePathWithExtension, string condition, int biorep, int techrep, int fraction) { @@ -29,5 +41,9 @@ public override int GetHashCode() { return FullFilePathWithExtension.GetHashCode(); } + public override string ToString() + { + return Path.GetFileName(FullFilePathWithExtension); + } } } \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs index 4a919a7e5..8f7bb320b 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs @@ -1,20 +1,17 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Text; -using System.Threading.Tasks; using Chemistry; -using Easy.Common.Extensions; using MathNet.Numerics.Statistics; using MzLibUtil; namespace MassSpectrometry { - public class ClassicDeconvolutionAlgorithm : DeconvolutionAlgorithm + internal class ClassicDeconvolutionAlgorithm : DeconvolutionAlgorithm { private MzSpectrum spectrum; - public ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) : base(deconParameters) + internal ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) : base(deconParameters) { } @@ -25,7 +22,7 @@ public ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) : /// spectrum to deconvolute /// Range of peaks to deconvolute /// - public override IEnumerable Deconvolute(MzSpectrum spectrumToDeconvolute, MzRange range) + internal override IEnumerable Deconvolute(MzSpectrum spectrumToDeconvolute, MzRange range) { var deconParams = DeconvolutionParameters as 
ClassicDeconvolutionParameters ?? throw new MzLibException("Deconvolution params and algorithm do not match"); spectrum = spectrumToDeconvolute; @@ -205,7 +202,7 @@ private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateFor } } - return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, Statistics.StandardDeviation(listOfRatios), massIndex); + return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, listOfRatios.StandardDeviation()); } private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, double mostIntensePeakMz, int massIndex, double deconvolutionTolerancePpm, double intensityRatioLimit, double minChargeToLookFor, double maxChargeToLookFor, List monoisotopicMassPredictions) diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs index e8a052e39..1bb6bf523 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs @@ -8,6 +8,9 @@ namespace MassSpectrometry { + /// + /// Parent class defining minimum requirement to be used + /// public abstract class DeconvolutionAlgorithm { // For ClassicDeconv. 
If not used elsewhere, move to that class @@ -79,6 +82,6 @@ protected DeconvolutionAlgorithm(DeconvolutionParameters deconParameters) /// spectrum to be deconvoluted /// Range of peaks to deconvolute /// - public abstract IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range); + internal abstract IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range); } } diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs index 18957d8d0..c70c10b63 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs @@ -1,22 +1,19 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Text; -using System.Threading.Tasks; using MzLibUtil; namespace MassSpectrometry { [ExcludeFromCodeCoverage] - public class ExampleNewDeconvolutionAlgorithmTemplate : DeconvolutionAlgorithm + internal class ExampleNewDeconvolutionAlgorithmTemplate : DeconvolutionAlgorithm { - public ExampleNewDeconvolutionAlgorithmTemplate(DeconvolutionParameters deconParameters) : base(deconParameters) + internal ExampleNewDeconvolutionAlgorithmTemplate(DeconvolutionParameters deconParameters) : base(deconParameters) { } - public override IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range = null) + internal override IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range = null) { var deconParams = DeconvolutionParameters as ExampleNewDeconvolutionParametersTemplate ?? 
throw new MzLibException("Deconvolution params and algorithm do not match"); range ??= spectrum.Range; diff --git a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs index d419561f2..773422862 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs @@ -1,10 +1,5 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Easy.Common.Extensions; -using Easy.Common.Interfaces; +using System.Collections.Generic; +using Chemistry; using MzLibUtil; namespace MassSpectrometry @@ -30,27 +25,11 @@ public static class Deconvoluter public static IEnumerable Deconvolute(MsDataScan scan, DeconvolutionParameters deconvolutionParameters, MzRange rangeToGetPeaksFrom = null) { - rangeToGetPeaksFrom ??= scan.MassSpectrum.Range; + // set any specific deconvolution parameters found only in the MsDataScan - // set deconvolution algorithm and any specific deconvolution parameters found in the MsDataScan - DeconvolutionAlgorithm deconAlgorithm; - switch (deconvolutionParameters.DeconvolutionType) - { - case DeconvolutionType.ClassicDeconvolution: - deconAlgorithm = new ClassicDeconvolutionAlgorithm(deconvolutionParameters); - break; - - case DeconvolutionType.ExampleNewDeconvolutionTemplate: - deconAlgorithm = new ExampleNewDeconvolutionAlgorithmTemplate(deconvolutionParameters); - break; - - default: throw new MzLibException("DeconvolutionType not yet supported"); - } - - return deconAlgorithm.Deconvolute(scan.MassSpectrum, rangeToGetPeaksFrom); + foreach (var isotopicEnvelope in Deconvolute(scan.MassSpectrum, deconvolutionParameters, rangeToGetPeaksFrom)) + yield return isotopicEnvelope; } - - /// /// Static deconvolution of an MzSpectrum that does not require Deconvoluter construction @@ -79,7 +58,22 @@ public static IEnumerable Deconvolute(MzSpectrum spectrum, default: throw new 
MzLibException("DeconvolutionType not yet supported"); } - return deconAlgorithm.Deconvolute(spectrum, rangeToGetPeaksFrom); + // Short circuit deconvolution if it is called on a neutral mass spectrum + if (spectrum is NeutralMassSpectrum newt) + { + for (int i = 0; i < newt.XArray.Length; i++) + { + // skip this peak if it's outside the range of interest (e.g. if we're only interested in deconvoluting a small m/z range) + if (!rangeToGetPeaksFrom.Contains(newt.XArray[i].ToMz(newt.Charges[i]))) + continue; + yield return new IsotopicEnvelope(newt.XArray[i], newt.YArray[i], newt.Charges[i]); + } + } + else + { + foreach (var isotopicEnvelope in deconAlgorithm.Deconvolute(spectrum, rangeToGetPeaksFrom)) + yield return isotopicEnvelope; + } } } } diff --git a/mzLib/MassSpectrometry/Enums/DissociationType.cs b/mzLib/MassSpectrometry/Enums/DissociationType.cs index 1ac136197..ca738b3fa 100644 --- a/mzLib/MassSpectrometry/Enums/DissociationType.cs +++ b/mzLib/MassSpectrometry/Enums/DissociationType.cs @@ -109,6 +109,11 @@ public enum DissociationType /// LowCID, + /// + /// activated ion electron photo detachment dissociation + /// + aEPD, + Unknown, AnyActivationType, Custom, diff --git a/mzLib/MassSpectrometry/MassSpectrometry.csproj b/mzLib/MassSpectrometry/MassSpectrometry.csproj index d803402ea..9d8e74edc 100644 --- a/mzLib/MassSpectrometry/MassSpectrometry.csproj +++ b/mzLib/MassSpectrometry/MassSpectrometry.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 x64 @@ -11,6 +11,7 @@ + @@ -19,4 +20,8 @@ + + + + diff --git a/mzLib/MassSpectrometry/MsDataFile.cs b/mzLib/MassSpectrometry/MsDataFile.cs index 7242f2cf7..3e88fcfef 100644 --- a/mzLib/MassSpectrometry/MsDataFile.cs +++ b/mzLib/MassSpectrometry/MsDataFile.cs @@ -23,9 +23,6 @@ namespace MassSpectrometry { - // TODO: Define scope of class - // Class scope is to provide to the data loaded from the DataFile. 
- /// /// A class for interacting with data collected from a Mass Spectrometer, and stored in a file /// diff --git a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs index 7e42426b1..3b2ab3d1b 100644 --- a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs +++ b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs @@ -14,29 +14,36 @@ public class IsotopicEnvelope : IHasMass /// /// Mass of most abundant observed isotopic peak, not accounting for addition or subtraction or protons due to ESI charge state induction /// - public double MostAbundantObservedIsotopicMass { get; private set; } + internal double MostAbundantObservedIsotopicMass { get; private set; } public readonly int Charge; public readonly double TotalIntensity; - public readonly double StDev; - public readonly int MassIndex; public double Score { get; private set; } - public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass, int bestChargeState, double bestTotalIntensity, double bestStDev, int bestMassIndex) + /// + /// Used for an isotopic envelope that mzLib deconvoluted (e.g., from a mass spectrum) + /// + public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass, int bestChargeState, double bestTotalIntensity, double bestStDev) { Peaks = bestListOfPeaks; MonoisotopicMass = bestMonoisotopicMass; - MostAbundantObservedIsotopicMass = GetMostAbundantObservedIsotopicMass(bestListOfPeaks, bestChargeState); + MostAbundantObservedIsotopicMass = bestListOfPeaks.MaxBy(p => p.intensity).mz * Math.Abs(bestChargeState); Charge = bestChargeState; TotalIntensity = bestTotalIntensity; - StDev = bestStDev; - MassIndex = bestMassIndex; - Score = ScoreIsotopeEnvelope(); + Score = ScoreIsotopeEnvelope(bestStDev); } - public double GetMostAbundantObservedIsotopicMass(List<(double mz, double intensity)> peaks, int charge) + /// + /// Used for a neutral mass read in 
from a deconvoluted file + /// Assumes the mass is correct: score is max value + /// + public IsotopicEnvelope(double monoisotopicMass, double intensity, int charge) { - return peaks.MaxBy(p => p.intensity).mz * Math.Abs(charge); + MonoisotopicMass = monoisotopicMass; + Charge = charge; + TotalIntensity = intensity; + Score = double.MaxValue; + Peaks = [(monoisotopicMass.ToMz(charge), intensity)]; } public override string ToString() @@ -44,10 +51,10 @@ public override string ToString() return Charge + "\t" + Peaks[0].mz.ToString("G8") + "\t" + Peaks.Count + "\t" + TotalIntensity; } - private double ScoreIsotopeEnvelope() //likely created by Stefan Solntsev using peptide data + private double ScoreIsotopeEnvelope(double stDev) //likely created by Stefan Solntsev using peptide data { return Peaks.Count >= 2 ? - TotalIntensity / Math.Pow(StDev, 0.13) * Math.Pow(Peaks.Count, 0.4) / Math.Pow(Math.Abs(Charge), 0.06) : + TotalIntensity / Math.Pow(stDev, 0.13) * Math.Pow(Peaks.Count, 0.4) / Math.Pow(Math.Abs(Charge), 0.06) : 0; } @@ -60,6 +67,5 @@ public void SetMedianMonoisotopicMass(List monoisotopicMassPredictions) { MonoisotopicMass = monoisotopicMassPredictions.Median(); } - } } \ No newline at end of file diff --git a/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs b/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs index 2e9fcc7a4..88a97d1ac 100644 --- a/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs +++ b/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs @@ -22,10 +22,8 @@ using System; using System.Collections; using System.Collections.Generic; -using System.Diagnostics; using System.IO; using System.Linq; -using System.Text.Json; namespace MassSpectrometry { @@ -126,7 +124,7 @@ public MzRange Range } } - public double? FirstX + public virtual double? FirstX { get { @@ -138,7 +136,7 @@ public double? FirstX } } - public double? LastX + public virtual double? 
LastX { get { @@ -373,7 +371,7 @@ public IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateForM } } - return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, Statistics.StandardDeviation(listOfRatios), massIndex); + return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, listOfRatios.StandardDeviation()); } [Obsolete("Deconvolution Has been moved to the Deconvoluter Object")] @@ -796,7 +794,12 @@ private MzPeak GetPeak(int index) return peakList[index]; } - private MzPeak GeneratePeak(int index) + /// + /// The source of all peaks which populate the peakList + /// + /// + /// + protected virtual MzPeak GeneratePeak(int index) { return new MzPeak(XArray[index], YArray[index]); } diff --git a/mzLib/MassSpectrometry/MzSpectra/NeutralMassSpectrum.cs b/mzLib/MassSpectrometry/MzSpectra/NeutralMassSpectrum.cs new file mode 100644 index 000000000..dcb5d7d2b --- /dev/null +++ b/mzLib/MassSpectrometry/MzSpectra/NeutralMassSpectrum.cs @@ -0,0 +1,65 @@ +using System; +using Chemistry; + +namespace MassSpectrometry +{ + public class NeutralMassSpectrum : MzSpectrum + { + public int[] Charges { get; init; } + public NeutralMassSpectrum(double[,] monoisotopicMassesIntensities, int[] charges) : base(monoisotopicMassesIntensities) + { + if (monoisotopicMassesIntensities.GetLength(0) != charges.Length) + throw new ArgumentException("The lengths of monoisotopicMasses, intensities, and charges must be the same."); + + Charges = charges; + + double minMz = double.MaxValue; + double maxMz = double.MinValue; + for (int i = 0; i < monoisotopicMassesIntensities.GetLength(0); i++) + { + var mz = monoisotopicMassesIntensities[i,0].ToMz(charges[i]); + if (mz < minMz) + minMz = mz; + if (mz > maxMz) + maxMz = mz; + } + + FirstX = minMz; + LastX = maxMz; + } + + public NeutralMassSpectrum(double[] monoisotopicMasses, double[] intensities, int[] charges, bool shouldCopy) + : 
base(monoisotopicMasses, intensities, shouldCopy) + { + if (monoisotopicMasses.GetLength(0) != intensities.Length || monoisotopicMasses.Length != charges.Length) + throw new ArgumentException("The lengths of monoisotopicMasses, intensities, and charges must be the same."); + + Charges = charges; + + double minMz = double.MaxValue; + double maxMz = double.MinValue; + for (int i = 0; i < monoisotopicMasses.Length; i++) + { + var mz = monoisotopicMasses[i].ToMz(charges[i]); + if (mz < minMz) + minMz = mz; + if (mz > maxMz) + maxMz = mz; + } + + FirstX = minMz; + LastX = maxMz; + } + + public override double? FirstX { get; } // in m/z + public override double? LastX { get; } // in m/z + + /// + /// Converts to a charged spectrum + /// + protected override MzPeak GeneratePeak(int index) + { + return new MzPeak(XArray[index].ToMz(Charges[index]), YArray[index]); + } + } +} diff --git a/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs b/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs index 189fc0beb..6dde46ce6 100644 --- a/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs +++ b/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs @@ -8,7 +8,7 @@ namespace MassSpectrometry.MzSpectra { public class SpectralSimilarity { - public SpectralSimilarity(MzSpectrum experimentalSpectrum, MzSpectrum theoreticalSpectrum, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool keepAllExperimentalPeaks = false, bool keepAllTheoreticalPeaks = true, double filterOutBelowThisMz = 300) + public SpectralSimilarity(MzSpectrum experimentalSpectrum, MzSpectrum theoreticalSpectrum, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool allPeaks, double filterOutBelowThisMz = 300) { ExperimentalYArray = Normalize(FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(experimentalSpectrum.XArray,experimentalSpectrum.YArray, filterOutBelowThisMz).Select(p=>p.Item2).ToArray(),scheme); ExperimentalXArray = 
FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(experimentalSpectrum.XArray, experimentalSpectrum.YArray, filterOutBelowThisMz).Select(p => p.Item1).ToArray(); @@ -16,12 +16,11 @@ public SpectralSimilarity(MzSpectrum experimentalSpectrum, MzSpectrum theoretica TheoreticalXArray = FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, filterOutBelowThisMz).Select(p => p.Item1).ToArray(); _localPpmTolerance = toleranceInPpm; _scheme = scheme; - _keepAllExperimentalPeaks = keepAllExperimentalPeaks; - _keepAllTheoreticalPeaks = keepAllTheoreticalPeaks; - IntensityPairs = GetIntensityPairs(keepAllExperimentalPeaks, keepAllTheoreticalPeaks); + _allPeaks = allPeaks; + IntensityPairs = GetIntensityPairs(allPeaks); } - public SpectralSimilarity(MzSpectrum experimentalSpectrum, IReadOnlyList theoreticalX, IReadOnlyList theoreticalY, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool keepAllExperimentalPeaks = false, bool keepAllTheoreticalPeaks = true, double filterOutBelowThisMz = 300) + public SpectralSimilarity(MzSpectrum experimentalSpectrum, IReadOnlyList theoreticalX, IReadOnlyList theoreticalY, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool allPeaks, double filterOutBelowThisMz = 300) { ExperimentalYArray = Normalize(FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(experimentalSpectrum.XArray, experimentalSpectrum.YArray, filterOutBelowThisMz).Select(p => p.Item2).ToArray(), scheme); ExperimentalXArray = FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(experimentalSpectrum.XArray, experimentalSpectrum.YArray, filterOutBelowThisMz).Select(p => p.Item1).ToArray(); @@ -29,12 +28,11 @@ public SpectralSimilarity(MzSpectrum experimentalSpectrum, IReadOnlyList TheoreticalXArray = FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(theoreticalX, theoreticalY, filterOutBelowThisMz).Select(p => p.Item1).ToArray(); _localPpmTolerance = toleranceInPpm; _scheme = scheme; - 
_keepAllExperimentalPeaks = keepAllExperimentalPeaks; - _keepAllTheoreticalPeaks = keepAllTheoreticalPeaks; - IntensityPairs = GetIntensityPairs(keepAllExperimentalPeaks, keepAllTheoreticalPeaks); + _allPeaks = allPeaks; + IntensityPairs = GetIntensityPairs(allPeaks); } - public SpectralSimilarity(IReadOnlyList pXArray, IReadOnlyList pYArray, IReadOnlyList qXArray, IReadOnlyList qYArray, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool keepAllExperimentalPeaks = false, bool keepAllTheoreticalPeaks = true, double filterOutBelowThisMz = 300) + public SpectralSimilarity(IReadOnlyList pXArray, IReadOnlyList pYArray, IReadOnlyList qXArray, IReadOnlyList qYArray, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool allPeaks, double filterOutBelowThisMz = 300) { ExperimentalYArray = Normalize(FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(pXArray, pYArray, filterOutBelowThisMz).Select(p => p.Item2).ToArray(), scheme); ExperimentalXArray = FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(pXArray, pYArray, filterOutBelowThisMz).Select(p => p.Item1).ToArray(); @@ -42,9 +40,8 @@ public SpectralSimilarity(IReadOnlyList pXArray, IReadOnlyList p TheoreticalXArray = FilterOutIonsBelowThisMzAndRemoveZeroIntensityPeaks(qXArray, qYArray, filterOutBelowThisMz).Select(p => p.Item1).ToArray(); _localPpmTolerance = toleranceInPpm; _scheme = scheme; - _keepAllExperimentalPeaks = keepAllExperimentalPeaks; - _keepAllTheoreticalPeaks = keepAllTheoreticalPeaks; - IntensityPairs = GetIntensityPairs(keepAllExperimentalPeaks, keepAllTheoreticalPeaks); + _allPeaks = allPeaks; + IntensityPairs = GetIntensityPairs(allPeaks); } public double[] ExperimentalYArray { get; } public double[] ExperimentalXArray { get; } @@ -53,7 +50,7 @@ public SpectralSimilarity(IReadOnlyList pXArray, IReadOnlyList p private readonly double _localPpmTolerance; private readonly SpectrumNormalizationScheme _scheme; - private readonly bool _keepAllExperimentalPeaks; + private readonly bool 
_allPeaks; private readonly bool _keepAllTheoreticalPeaks; public List<(double, double)> IntensityPairs { get; } = new(); @@ -116,7 +113,7 @@ private static double[] Normalize(double[] spectrum, SpectrumNormalizationScheme /// /// /// - private List<(double,double)> GetIntensityPairs(bool keepAllExperimentalPeaks, bool keepAllTheoreticalPeaks, double[] experimentalYArray = null, double[] theoreticalYArray = null) + private List<(double,double)> GetIntensityPairs(bool allPeaks, double[] experimentalYArray = null, double[] theoreticalYArray = null) { if (experimentalYArray == null) experimentalYArray = ExperimentalYArray; if (theoreticalYArray == null) theoreticalYArray = TheoreticalYArray; @@ -126,54 +123,55 @@ private static double[] Normalize(double[] spectrum, SpectrumNormalizationScheme //when all mz of theoretical peaks or experimental peaks are less than mz cut off , it is treated as no corresponding library spectrum is found and later the similarity score will be assigned as null. 
return new List<(double, double)> { (-1, -1) }; } + List<(double, double)> intensityPairs = new(); - int expIndex = 0; - int theoIndex = 0; - do + List<(double, double)> experimental = new(); + List<(double, double)> theoretical = new(); + + for (int i = 0; i < ExperimentalXArray.Length; i++) { - if (Within(ExperimentalXArray[expIndex], TheoreticalXArray[theoIndex])) - { - intensityPairs.Add((experimentalYArray[expIndex], theoreticalYArray[theoIndex])); - expIndex++; - theoIndex++; - } - else if(ExperimentalXArray[expIndex] < TheoreticalXArray[theoIndex]) + experimental.Add((ExperimentalXArray[i], experimentalYArray[i])); + } + for (int i = 0; i < TheoreticalXArray.Length; i++) + { + theoretical.Add((TheoreticalXArray[i], theoreticalYArray[i])); + } + + experimental = experimental.OrderByDescending(i => i.Item2).ToList(); + theoretical = theoretical.OrderByDescending(i => i.Item2).ToList(); + + foreach ((double, double) xyPair in theoretical) + { + int index = 0; + while (experimental.Count > 0 && index < experimental.Count) { - if (keepAllExperimentalPeaks) + if (Within(experimental[index].Item1, xyPair.Item1)) { - intensityPairs.Add((experimentalYArray[expIndex], 0)); + intensityPairs.Add((experimental[index].Item2, xyPair.Item2)); + experimental.RemoveAt(index); + index = -1; + break; } - expIndex++; + index++; } - else + if (experimental.Count == 0) { - if (keepAllTheoreticalPeaks) - { - intensityPairs.Add((0, theoreticalYArray[theoIndex])); - } - theoIndex++; + index++; + } + if (index > 0) + { + //didn't find a experimental mz in range + intensityPairs.Add((0, xyPair.Item2)); } } - while (expIndex < ExperimentalXArray.Length && theoIndex < TheoreticalXArray.Length); - //if the theoretical peak count is different than the experimental peak count, and the bool createPairsForAllExperimentalMzValues = TRUE then - //we need to add zero intensity pairs for each experimental peak that does not have a corresponding theoretical peak - while (expIndex < 
ExperimentalXArray.Length && keepAllExperimentalPeaks) - { - intensityPairs.Add((experimentalYArray[expIndex], 0)); - expIndex++; - } - //We add an intensity pair for every value in the theoretical spectrum. - while (theoIndex < TheoreticalXArray.Length && keepAllTheoreticalPeaks) + //If we're keeping all experimental and theoretical peaks, then we add intensity pairs for all unpaired experimental peaks here. + if (experimental.Count > 0 && allPeaks) { - intensityPairs.Add((0, theoreticalYArray[theoIndex])); - theoIndex++; - } - - //if there are no intensity pairs, then we are required to return a single pair of (-1,-1) to indicate that no peaks were found - if (intensityPairs.Count == 0) - { - intensityPairs.Add((-1, -1)); + foreach ((double, double) xyPair in experimental) + { + intensityPairs.Add((xyPair.Item2, 0)); + } } return intensityPairs; } @@ -330,7 +328,7 @@ public static double[] NormalizeSpectrumSum(double[] spectrum) // This method should only be used when allPeaks is set to true public double? 
SpectralEntropy() { - if (_scheme != SpectrumNormalizationScheme.SpectrumSum && !_keepAllExperimentalPeaks && !_keepAllTheoreticalPeaks) + if (_scheme != SpectrumNormalizationScheme.SpectrumSum && !_allPeaks && !_keepAllTheoreticalPeaks) { return null; } diff --git a/mzLib/MzIdentML/MzIdentML.csproj b/mzLib/MzIdentML/MzIdentML.csproj index 6adfa72d7..67eb6987f 100644 --- a/mzLib/MzIdentML/MzIdentML.csproj +++ b/mzLib/MzIdentML/MzIdentML.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 x64 @@ -9,6 +9,10 @@ full true + + + + diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 05c94a2f9..0129154a4 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -19,6 +19,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Text.RegularExpressions; namespace MzLibUtil { @@ -101,5 +102,26 @@ public static bool AllSame(this IEnumerable list) return true; } + /// + /// Finds the index of all instances of a specified substring within the source string. 
+ /// The index returned is the position of the first character of the substring within the source string + /// + /// Haystack: string to be searched + /// Needle: substring to be located + public static IEnumerable IndexOfAll(this string sourceString, string subString) + { + return Regex.Matches(sourceString, subString).Cast().Select(m => m.Index); + } + + /// + /// Extension method to invoke the GetPeriodTolerantFilenameWithoutExtension method + /// + /// + /// + public static string GetPeriodTolerantFilenameWithoutExtension(this string filePath) + { + return PeriodTolerantFilenameWithoutExtension.GetPeriodTolerantFilenameWithoutExtension(filePath); + } + } } \ No newline at end of file diff --git a/mzLib/MzLibUtil/MzLibException.cs b/mzLib/MzLibUtil/MzLibException.cs index cf86074d8..885081433 100644 --- a/mzLib/MzLibUtil/MzLibException.cs +++ b/mzLib/MzLibUtil/MzLibException.cs @@ -1,13 +1,9 @@ -using System; +#nullable enable +using System; namespace MzLibUtil { [Serializable] - public class MzLibException : Exception - { - public MzLibException(string message) - : base(message) - { - } - } + public class MzLibException(string message, Exception? 
innerException = null) + : Exception(message, innerException); } \ No newline at end of file diff --git a/mzLib/MzLibUtil/MzLibUtil.csproj b/mzLib/MzLibUtil/MzLibUtil.csproj index 84d202daf..c6b5cf526 100644 --- a/mzLib/MzLibUtil/MzLibUtil.csproj +++ b/mzLib/MzLibUtil/MzLibUtil.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 x64 @@ -11,7 +11,8 @@ - + + diff --git a/mzLib/Omics/Fragmentation/FragmentationTerminus.cs b/mzLib/Omics/Fragmentation/FragmentationTerminus.cs index 146309caa..788041690 100644 --- a/mzLib/Omics/Fragmentation/FragmentationTerminus.cs +++ b/mzLib/Omics/Fragmentation/FragmentationTerminus.cs @@ -1,19 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Omics.Fragmentation +namespace Omics.Fragmentation { public enum FragmentationTerminus - { - Both, //N- and C-terminus - N, //N-terminus only - C, //C-terminus only + { + Both, //N- and C-terminus + N, //N-terminus only + C, //C-terminus only None, //used for internal fragments, could be used for top down intact mass? 
FivePrime, // 5' for NucleicAcids ThreePrime, // 3' for NucleicAcids - } - + } } diff --git a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs index d5b020160..4302fadcb 100644 --- a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs +++ b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs @@ -1 +1,161 @@ -using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using Chemistry; using MassSpectrometry; namespace Omics.Fragmentation.Oligo { /// /// Methods dealing with specific product type for RNA molecules /// public static class DissociationTypeCollection { /// /// Product Ion types by dissociation method /// private static readonly Dictionary> ProductsFromDissociationType = new Dictionary>() { { DissociationType.Unknown, new List() }, { DissociationType.CID, new List { ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.LowCID, new List() { } }, { DissociationType.IRMPD, new List() { } }, { DissociationType.ECD, new List { } }, { DissociationType.PQD, new List { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.yWaterLoss, ProductType.d, ProductType.M } }, { DissociationType.ETD, new List { } }, { DissociationType.HCD, new List { ProductType.w, ProductType.y, ProductType.aBaseLoss, ProductType.dWaterLoss, ProductType.M } }, { DissociationType.AnyActivationType, new List { } }, { DissociationType.EThcD, new List { } }, { DissociationType.Custom, new List { } }, { DissociationType.ISCID, new List { } } }; /// /// Returns list of products types based upon the dissociation type /// /// /// public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => 
ProductsFromDissociationType[dissociationType]; /// /// Mass to be added or subtracted /// private static readonly Dictionary FragmentIonCaps = new Dictionary { { ProductType.a, ChemicalFormula.ParseFormula("H") }, { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.b, ChemicalFormula.ParseFormula("OH") }, { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, { ProductType.w, ChemicalFormula.ParseFormula("H") }, { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, //fragment - Base chemical formula is the corresponding fragment chemical formula subtracing 1 H as H is lost when base is removed { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 { ProductType.M, new ChemicalFormula() } }; /// /// Returns mass shift by 
product type /// /// /// public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) { switch (fragmentType) { case ProductType.a: case ProductType.aWaterLoss: case ProductType.aBaseLoss: case ProductType.b: case ProductType.bWaterLoss: case ProductType.bBaseLoss: case ProductType.c: case ProductType.cWaterLoss: case ProductType.cBaseLoss: case ProductType.d: case ProductType.dWaterLoss: case ProductType.dBaseLoss: return FragmentationTerminus.FivePrime; case ProductType.w: case ProductType.wWaterLoss: case ProductType.wBaseLoss: case ProductType.x: case ProductType.xWaterLoss: case ProductType.xBaseLoss: case ProductType.y: case ProductType.yWaterLoss: case ProductType.yBaseLoss: case ProductType.z: case ProductType.zWaterLoss: case ProductType.zBaseLoss: return FragmentationTerminus.ThreePrime; case ProductType.M: return FragmentationTerminus.None; case ProductType.aStar: case ProductType.aDegree: case ProductType.bAmmoniaLoss: case ProductType.yAmmoniaLoss: case ProductType.zPlusOne: case ProductType.D: case ProductType.Ycore: case ProductType.Y: default: throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); } } /// /// Product ion types by Fragmentation Terminus /// private static readonly Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> { { FragmentationTerminus.FivePrime, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, } }, { FragmentationTerminus.ThreePrime, new List { ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, 
ProductType.zWaterLoss, ProductType.zBaseLoss, } }, { FragmentationTerminus.Both, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, ProductType.M } } }; public static List GetRnaTerminusSpecificProductTypes( this FragmentationTerminus fragmentationTerminus) { return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; } /// /// Returns all product ion types based upon specified terminus /// /// /// /// public static List GetRnaTerminusSpecificProductTypesFromDissociation( this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) { var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); return terminusSpecific.Intersect(dissociationSpecific).ToList(); } } } \ No newline at end of file +using Chemistry; +using MassSpectrometry; + +namespace Omics.Fragmentation.Oligo +{ + /// + /// Methods dealing with specific product type for RNA molecules + /// + public static class DissociationTypeCollection + { + /// + /// Product Ion types by dissociation method + /// + /// + /// HCD ions were taken from the following paper: https://www.nature.com/articles/s41598-023-36193-2 + /// Ion types below here should be validated with experimental results. + /// Base and water losses occur very frequently and may also be present in these activation types. 
+ /// CID, UVPD, and aEPD ions were taken from the following paper: https://pubs.acs.org/doi/10.1021/acs.analchem.3c05428?ref=PDF + /// NETD ions were taken from the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7161943/ + /// lowCID ions were taken from this Thermo Poster: https://assets.thermofisher.com/TFS-Assets/CMD/Flyers/fl-489263-asms23-optimized-fragmentation-oligonucleotides-suppresses-undesired-fragmentation-fl489263-en.pdf + /// + public static Dictionary> ProductsFromDissociationType = + new Dictionary>() + { + { DissociationType.Unknown, new List() }, + { DissociationType.Custom, new List() }, + { + DissociationType.AnyActivationType, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.aWaterLoss, + ProductType.b, ProductType.bBaseLoss, ProductType.bWaterLoss, + ProductType.c, ProductType.cBaseLoss, ProductType.cWaterLoss, + ProductType.d, ProductType.dBaseLoss, ProductType.dWaterLoss, + ProductType.w, ProductType.wBaseLoss, ProductType.wWaterLoss, + ProductType.x, ProductType.xBaseLoss, ProductType.xWaterLoss, + ProductType.y, ProductType.yBaseLoss, ProductType.yWaterLoss, + ProductType.z, ProductType.zBaseLoss, ProductType.zWaterLoss, + ProductType.M + } + }, + { + DissociationType.CID, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, + ProductType.y, ProductType.yWaterLoss, ProductType.M + } + }, + { + DissociationType.HCD, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, + ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, + ProductType.M + } + }, + { + DissociationType.UVPD, new List + { + ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.M + } + }, + { + DissociationType.aEPD, new List + { + ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.x, ProductType.z, ProductType.M + } + }, + { + DissociationType.NETD, new List + { + 
ProductType.w, ProductType.d, ProductType.M + } + }, + { + DissociationType.LowCID, new List() + { + ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, + ProductType.y, ProductType.yWaterLoss, ProductType.M + } + }, + { DissociationType.IRMPD, new List() { } }, + { DissociationType.ECD, new List { } }, + { DissociationType.PQD, new List { } }, + { DissociationType.ETD, new List { } }, + { DissociationType.EThcD, new List { } }, + }; + + /// + /// Returns all dissociation types with implemented product type collections + /// + public static IEnumerable AllImplementedDissociationTypes => + ProductsFromDissociationType.Where(p => p.Value.Any()) + .Select(p => p.Key); + + /// + /// Returns list of products types based upon the dissociation type + /// + /// + /// + public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => + ProductsFromDissociationType[dissociationType]; + + /// + /// Returns mass shift by product type + /// + /// + /// + public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; + + /// + /// Mass to be added or subtracted + /// + private static readonly Dictionary FragmentIonCaps = + new Dictionary + { + { ProductType.a, ChemicalFormula.ParseFormula("H") }, + { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, + { ProductType.b, ChemicalFormula.ParseFormula("OH") }, + { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, + { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, + { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, + { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, + { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, + + { ProductType.w, ChemicalFormula.ParseFormula("H") }, + { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, + { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, + { ProductType.xWaterLoss, 
ChemicalFormula.ParseFormula("O-2H-1") }, + { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, + { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, + { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, + { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, + //fragment - Base chemical formula is the corresponding fragment chemical formula subtracting 1 H as H is lost when base is removed + { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H + { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H + { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H + { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H + + { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H + { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H + { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H + { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 + + { ProductType.M, new ChemicalFormula() } + }; + + /// + /// Returns all product ion types based upon specified terminus + /// + /// + /// + /// + public static List GetRnaTerminusSpecificProductTypesFromDissociation( + this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) + { + var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); + var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); + return terminusSpecific.Intersect(dissociationSpecific).ToList(); + } + } +} diff --git a/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs b/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs new file mode 100644 index 000000000..0ec5541cd --- /dev/null +++ b/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs @@ -0,0 +1,141 @@ +using System; +using 
System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Omics.Fragmentation.Oligo +{ + public static class TerminusSpecificProductTypes + { + public static List GetRnaTerminusSpecificProductTypes( + this FragmentationTerminus fragmentationTerminus) + { + return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; + } + + /// + /// The types of ions that can be generated from an oligo fragment, based on the terminus of the fragment + /// + public static Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> + { + { + FragmentationTerminus.FivePrime, new List + { + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + } + }, + { + FragmentationTerminus.ThreePrime, new List + { + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + } + }, + { + FragmentationTerminus.Both, new List + { + + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + ProductType.M + } + + }, + { + FragmentationTerminus.None, new List() + } + }; + + + public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) + { + switch (fragmentType) + { + case 
ProductType.a: + case ProductType.aWaterLoss: + case ProductType.aBaseLoss: + case ProductType.b: + case ProductType.bWaterLoss: + case ProductType.bBaseLoss: + case ProductType.c: + case ProductType.cWaterLoss: + case ProductType.cBaseLoss: + case ProductType.d: + case ProductType.dWaterLoss: + case ProductType.dBaseLoss: + case ProductType.w: + case ProductType.wWaterLoss: + case ProductType.wBaseLoss: + case ProductType.x: + case ProductType.xWaterLoss: + case ProductType.xBaseLoss: + case ProductType.y: + case ProductType.yWaterLoss: + case ProductType.yBaseLoss: + case ProductType.z: + case ProductType.zWaterLoss: + case ProductType.zBaseLoss: + case ProductType.M: + return ProductTypeToFragmentationTerminus[fragmentType]; + + case ProductType.aStar: + case ProductType.aDegree: + case ProductType.bAmmoniaLoss: + case ProductType.yAmmoniaLoss: + case ProductType.zPlusOne: + case ProductType.D: + case ProductType.Ycore: + case ProductType.Y: + default: + throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); + } + } + + + /// + /// The terminus of the oligo fragment that the product ion is generated from + /// + public static Dictionary ProductTypeToFragmentationTerminus = new Dictionary + { + { ProductType.a, FragmentationTerminus.FivePrime }, + { ProductType.aWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.aBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.b, FragmentationTerminus.FivePrime }, + { ProductType.bWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.bBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.c, FragmentationTerminus.FivePrime }, + { ProductType.cWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.cBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.d, FragmentationTerminus.FivePrime }, + { ProductType.dWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.dBaseLoss, FragmentationTerminus.FivePrime }, + + { ProductType.w, 
FragmentationTerminus.ThreePrime }, + { ProductType.wWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.wBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.x, FragmentationTerminus.ThreePrime }, + { ProductType.xWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.xBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.y, FragmentationTerminus.ThreePrime }, + { ProductType.yWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.yBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.z, FragmentationTerminus.ThreePrime }, + { ProductType.zWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.zBaseLoss, FragmentationTerminus.ThreePrime }, + + { ProductType.M, FragmentationTerminus.Both } + }; + } +} diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs index 1c3ade66a..12989b1f3 100644 --- a/mzLib/Omics/IBioPolymerWithSetMods.cs +++ b/mzLib/Omics/IBioPolymerWithSetMods.cs @@ -50,7 +50,16 @@ public void Fragment(DissociationType dissociationType, FragmentationTerminus fr public void FragmentInternally(DissociationType dissociationType, int minLengthOfFragments, List products); - public IBioPolymerWithSetMods Localize(int j, double massToLocalize); + /// + /// Outputs a duplicate IBioPolymerWithSetMods with a localized mass shift, replacing a modification when present + /// + /// Used to localize an unknown mass shift in the MetaMorpheus Localization Engine + /// + /// + /// The index of the modification in the AllModOneIsNTerminus Dictionary - 2 (idk why -2) + /// The mass to add to the BioPolymer + /// + public IBioPolymerWithSetMods Localize(int indexOfMass, double massToLocalize); public static string GetBaseSequenceFromFullSequence(string fullSequence) { @@ -73,5 +82,86 @@ public static string GetBaseSequenceFromFullSequence(string fullSequence) } return sb.ToString(); } + + /// + /// Returns a list of modifications and their OneBased index from a full sequence + 
/// + /// Full sequence + /// All known modifications + /// + /// When a full sequence is not in the correct format or a mod is not found in the allModsKnown dictionary + public static Dictionary GetModificationDictionaryFromFullSequence(string fullSequence, + Dictionary allModsKnown) + { + var allModsOneIsNterminus = new Dictionary(); + var baseSequence = GetBaseSequenceFromFullSequence(fullSequence); + int currentModStart = 0; + int currentModificationLocation = 1; + bool currentlyReadingMod = false; + int bracketCount = 0; + + for (int r = 0; r < fullSequence.Length; r++) + { + char c = fullSequence[r]; + if (c == '[') + { + currentlyReadingMod = true; + if (bracketCount == 0) + { + currentModStart = r + 1; + } + bracketCount++; + } + else if (c == ']') + { + string modId = null; + bracketCount--; + if (bracketCount == 0) + { + try + { + //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot") + string modString = fullSequence.Substring(currentModStart, r - currentModStart); + int splitIndex = modString.IndexOf(':'); + string modType = modString.Substring(0, splitIndex); + modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); + } + catch (Exception e) + { + throw new MzLibUtil.MzLibException( + "Error while trying to parse string into peptide: " + e.Message, e); + + } + if (!allModsKnown.TryGetValue(modId, out var mod)) + { + throw new MzLibUtil.MzLibException( + "Could not find modification while reading string: " + fullSequence); + } + if (mod.LocationRestriction.Contains("C-terminal.") && r == fullSequence.Length - 1) + { + currentModificationLocation = baseSequence.Length + 2; + } + allModsOneIsNterminus.Add(currentModificationLocation, mod); + currentlyReadingMod = false; + } + } + else if (!currentlyReadingMod) + { + currentModificationLocation++; + } + //else do nothing + } + + return allModsOneIsNterminus; + } + + /// + /// Returns a list of modifications from a full sequence + /// + /// Full sequence + /// All known 
modifications + /// + public static List GetModificationsFromFullSequence(string fullSequence, + Dictionary allModsKnown) => [.. GetModificationDictionaryFromFullSequence(fullSequence, allModsKnown).Values]; } } diff --git a/mzLib/Omics/Omics.csproj b/mzLib/Omics/Omics.csproj index 6493de1a6..8283d5d7b 100644 --- a/mzLib/Omics/Omics.csproj +++ b/mzLib/Omics/Omics.csproj @@ -1,12 +1,16 @@  - net6.0 + net8.0 x64 enable enable + + + + diff --git a/mzLib/Omics/SpectrumMatch/LibrarySpectrum.cs b/mzLib/Omics/SpectrumMatch/LibrarySpectrum.cs index f4e77de77..f68f4ef4c 100644 --- a/mzLib/Omics/SpectrumMatch/LibrarySpectrum.cs +++ b/mzLib/Omics/SpectrumMatch/LibrarySpectrum.cs @@ -52,7 +52,7 @@ public string CalculateSpectralAngleOnTheFly(List spectrumMa MatchedFragmentIons.Select(f => f.Intensity).ToArray(), SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, toleranceInPpm: 20, - keepAllExperimentalPeaks: true); + allPeaks: true); double? spectralContrastAngle = spectraComparison.SpectralContrastAngle(); return spectralContrastAngle == null diff --git a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs index 1d7f1b231..1abb40e99 100644 --- a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs +++ b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs @@ -1103,7 +1103,7 @@ private void ParseSequence(string sequence) { modification = new OldSchoolChemicalFormulaModification(ChemicalFormula.ParseFormula(modString)); } - catch (MzLibException) + catch (MzLibException e) { if (double.TryParse(modString, out double mass)) { @@ -1111,7 +1111,7 @@ private void ParseSequence(string sequence) } else { - throw new MzLibException("Unable to correctly parse the following modification: " + modString); + throw new MzLibException("Unable to correctly parse the following modification: " + modString, e); } } diff --git a/mzLib/Proteomics/PSM/PsmFromTsv.cs b/mzLib/Proteomics/PSM/PsmFromTsv.cs index 
95605ab49..5837c745d 100644 --- a/mzLib/Proteomics/PSM/PsmFromTsv.cs +++ b/mzLib/Proteomics/PSM/PsmFromTsv.cs @@ -259,6 +259,5 @@ public PsmFromTsv(PsmFromTsv psm, string fullSequence, int index = 0, string bas LocalizedGlycan = psm.LocalizedGlycan; } - } } diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index 86e51b54f..fc07460d2 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -7,6 +7,8 @@ using Omics.Digestion; using Omics.Fragmentation; using Omics.Modifications; +using MzLibUtil; +using Easy.Common.Extensions; namespace Proteomics { @@ -76,14 +78,14 @@ public Protein(string sequence, string accession, string organism = null, List /// Protein construction that clones a protein but assigns a different base sequence - /// For use in SILAC experiments + /// For use in SILAC experiments and in decoy construction /// /// - /// + /// /// - public Protein(Protein originalProtein, string silacSequence) + public Protein(Protein originalProtein, string newBaseSequence) { - BaseSequence = silacSequence; + BaseSequence = newBaseSequence; Accession = originalProtein.Accession; NonVariantProtein = originalProtein.NonVariantProtein; Name = originalProtein.Name; @@ -156,7 +158,7 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable /// Base sequence, which may contain applied sequence variations. /// - public string BaseSequence { get; } + public string BaseSequence { get; private set; } public string Organism { get; } public bool IsDecoy { get; } @@ -206,6 +208,7 @@ public string GetUniProtFastaHeader() { var n = GeneNames.FirstOrDefault(); string geneName = n == null ? 
"" : n.Item2; + return string.Format("mz|{0}|{1} {2} OS={3} GN={4}", Accession, Name, FullName, Organism, geneName); } @@ -801,6 +804,162 @@ private static string GetName(IEnumerable appliedVariations, } } + /// + /// This function takes in a decoy protein and a list of forbidden sequences that the decoy + /// protein should not contain. Optionally, a list of the peptides within the base sequence + /// of the decoy protein that need to be scrambled can be passed as well. It will scramble the required sequences, + /// leaving cleavage sites intact. + /// + /// A Decoy protein to be cloned + /// Digestion parameters + /// A HashSet of forbidden sequences that the decoy protein should not contain. Typically, a set of target base sequences + /// Optional IEnumberable of sequences within the decoy protein that need to be replaced. + /// If this is passed, only sequences within the IEnumerable will be replaced!!! + /// A cloned copy of the decoy protein with a scrambled sequence + public static Protein ScrambleDecoyProteinSequence( + Protein originalDecoyProtein, + DigestionParams digestionParams, + HashSet forbiddenSequences, + IEnumerable sequencesToScramble = null) + { + // If no sequencesToScramble are passed in, we check to see if any + // peptides in the decoy are forbidden sequences + sequencesToScramble = sequencesToScramble ?? originalDecoyProtein + .Digest(digestionParams, new List(), new List()) + .Select(pep => pep.FullSequence) + .Where(forbiddenSequences.Contains); + if(sequencesToScramble.Count() == 0) + { + return originalDecoyProtein; + } + + string scrambledProteinSequence = originalDecoyProtein.BaseSequence; + // Clone the original protein's modifications + var scrambledModificationDictionary = originalDecoyProtein.OriginalNonVariantModifications.ToDictionary(kvp => kvp.Key, kvp => kvp.Value); + Random rng = new Random(42); + + // Start small and then go big. 
If we scramble a zero-missed cleavage peptide, but the missed cleavage peptide contains the previously scrambled peptide + // Then we can avoid unnecessary operations as the scrambledProteinSequence will no longer contain the longer sequence of the missed cleavage peptide + foreach(string peptideSequence in sequencesToScramble.OrderBy(seq => seq.Length)) + { + if(scrambledProteinSequence.Contains(peptideSequence)) + { + string scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng, + out var swappedArray); + int scrambleAttempts = 1; + + // Try five times to scramble the peptide sequence without creating a forbidden sequence + while(forbiddenSequences.Contains(scrambledPeptideSequence) & scrambleAttempts <= 5) + { + scrambledPeptideSequence = ScrambleSequence(peptideSequence, digestionParams.DigestionAgent.DigestionMotifs, rng, + out swappedArray); + scrambleAttempts++; + } + + scrambledProteinSequence = scrambledProteinSequence.Replace(peptideSequence, scrambledPeptideSequence); + + if (!scrambledModificationDictionary.Any()) continue; + + // rearrange the modifications + foreach (int index in scrambledProteinSequence.IndexOfAll(scrambledPeptideSequence)) + { + // Get mods that were affected by the scramble + var relevantMods = scrambledModificationDictionary.Where(kvp => + kvp.Key >= index + 1 && kvp.Key < index + peptideSequence.Length + 1).ToList(); + + // Modify the dictionary to reflect the new positions of the modifications + foreach (var kvp in relevantMods) + { + int newKey = swappedArray[kvp.Key - 1 - index] + 1 + index; + // To prevent collisions, we have to check if mods already exist at the new idx. 
+ if(scrambledModificationDictionary.TryGetValue(newKey, out var modsToSwap)) + { + // If there are mods at the new idx, we swap the mods + scrambledModificationDictionary[newKey] = kvp.Value; + scrambledModificationDictionary[kvp.Key] = modsToSwap; + } + else + { + scrambledModificationDictionary.Add(newKey, kvp.Value); + scrambledModificationDictionary.Remove(kvp.Key); + } + } + } + } + } + + Protein newProtein = new Protein(originalDecoyProtein, scrambledProteinSequence); + + // Update the modifications using the scrambledModificationDictionary + newProtein.OriginalNonVariantModifications = scrambledModificationDictionary; + newProtein.OneBasedPossibleLocalizedModifications = newProtein.SelectValidOneBaseMods(scrambledModificationDictionary); + + return newProtein; + } + + /// + /// Scrambles a peptide sequence, preserving the position of any cleavage sites. + /// + /// An array that maps the previous position (index) to the new position (value) + public static string ScrambleSequence(string sequence, List motifs, Random rng, out int[] swappedPositionArray) + { + // First, find the location of every cleavage motif. These sites shouldn't be scrambled. + HashSet zeroBasedCleavageSitesLocations = new(); + foreach (var motif in motifs) + { + for (int i = 0; i < sequence.Length; i++) + { + (bool fits, bool prevents) = motif.Fits(sequence, i); + if (fits && !prevents) + { + zeroBasedCleavageSitesLocations.Add(i); + } + } + } + + // Next, scramble the sequence using the Fisher-Yates shuffle algorithm. + char[] sequenceArray = sequence.ToCharArray(); + // We're going to keep track of the positions of the characters in the original sequence, + // This will enable us to adjust the location of modifications that are present in the original sequence + // to the new scrambled sequence. 
+ int[] tempPositionArray = Enumerable.Range(0, sequenceArray.Length).ToArray(); + int n = sequenceArray.Length; + while(n > 1) + { + n--; + if(zeroBasedCleavageSitesLocations.Contains(n)) + { + // Leave the cleavage site in place + continue; + } + int k = rng.Next(n + 1); + // don't swap the position of a cleavage site + while(zeroBasedCleavageSitesLocations.Contains(k)) + { + k = rng.Next(n + 1); + } + + // rearrange the sequence array + char tempResidue = sequenceArray[k]; + sequenceArray[k] = sequenceArray[n]; + sequenceArray[n] = tempResidue; + + // update the position array to represent the swaps + int tempPosition = tempPositionArray[k]; + tempPositionArray[k] = tempPositionArray[n]; + tempPositionArray[n] = tempPosition; + } + + // This maps the previous position (index) to the new position (value) + swappedPositionArray = new int[tempPositionArray.Length]; + for (int i = 0; i < tempPositionArray.Length; i++) + { + swappedPositionArray[tempPositionArray[i]] = i; + } + + return new string(sequenceArray); + } + public int CompareTo(Protein other) { //permits sorting of proteins diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 067aff6d7..aafec0a5e 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -18,10 +18,10 @@ public class PeptideWithSetModifications : ProteolyticPeptide, IBioPolymerWithSe { public string FullSequence { get; private set; } //sequence with modifications public int NumFixedMods { get; } - // Parameter to store a hash code corresponding to a Decoy or a Target peptide + // Parameter to store the full sequence of the corresponding Target or Decoy peptide // If the peptide in question is a decoy, this pairs it to the target it was generated from // If the peptide in question is a target, this pairs it to its corresponding decoy - 
public int? PairedTargetDecoyHash { get; private set; } + public string PairedTargetDecoySequence { get; private set; } /// /// Dictionary of modifications on the peptide. The N terminus is index 1. /// The key indicates which residue modification is on (with 1 being N terminus). @@ -40,7 +40,7 @@ public class PeptideWithSetModifications : ProteolyticPeptide, IBioPolymerWithSe /// public PeptideWithSetModifications(Protein protein, IDigestionParams digestionParams, int oneBasedStartResidueInProtein, int oneBasedEndResidueInProtein, CleavageSpecificity cleavageSpecificity, string peptideDescription, int missedCleavages, - Dictionary allModsOneIsNterminus, int numFixedMods, string baseSequence = null, int? pairedTargetDecoyHash = null) + Dictionary allModsOneIsNterminus, int numFixedMods, string baseSequence = null, string pairedTargetDecoySequence = null) : base(protein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificity, peptideDescription, baseSequence) { _allModsOneIsNterminus = allModsOneIsNterminus; @@ -49,7 +49,7 @@ public PeptideWithSetModifications(Protein protein, IDigestionParams digestionPa FullSequence = this.DetermineFullSequence(); ProteinAccession = protein.Accession; UpdateCleavageSpecificity(); - PairedTargetDecoyHash = pairedTargetDecoyHash; // Added PairedTargetDecoyHash as a nullable integer + PairedTargetDecoySequence = pairedTargetDecoySequence; } /// @@ -59,7 +59,7 @@ public PeptideWithSetModifications(Protein protein, IDigestionParams digestionPa public PeptideWithSetModifications(string sequence, Dictionary allKnownMods, int numFixedMods = 0, IDigestionParams digestionParams = null, Protein p = null, int oneBasedStartResidueInProtein = int.MinValue, int oneBasedEndResidueInProtein = int.MinValue, int missedCleavages = int.MinValue, - CleavageSpecificity cleavageSpecificity = CleavageSpecificity.Full, string peptideDescription = null, int? 
pairedTargetDecoyHash = null) + CleavageSpecificity cleavageSpecificity = CleavageSpecificity.Full, string peptideDescription = null, string pairedTargetDecoySequence = null) : base(p, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificity, peptideDescription) { if (sequence.Contains("|")) @@ -69,10 +69,10 @@ public PeptideWithSetModifications(string sequence, Dictionary(AllModsOneIsNterminus); double massOfExistingMod = 0; - if (dictWithLocalizedMass.TryGetValue(j + 2, out Modification modToReplace)) + if (dictWithLocalizedMass.TryGetValue(indexOfMass + 2, out Modification modToReplace)) { massOfExistingMod = (double)modToReplace.MonoisotopicMass; - dictWithLocalizedMass.Remove(j + 2); + dictWithLocalizedMass.Remove(indexOfMass + 2); } - dictWithLocalizedMass.Add(j + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); + dictWithLocalizedMass.Add(indexOfMass + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); var peptideWithLocalizedMass = new PeptideWithSetModifications(Protein, _digestionParams, OneBasedStartResidueInProtein, OneBasedEndResidueInProtein, CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, dictWithLocalizedMass, NumFixedMods); @@ -902,14 +902,7 @@ public override bool Equals(object obj) public override int GetHashCode() { - if (DigestionParams == null) - { - return FullSequence.GetHashCode(); - } - else - { - return FullSequence.GetHashCode() + DigestionParams.DigestionAgent.GetHashCode(); - } + return FullSequence.GetHashCode(); } /// @@ -917,7 +910,7 @@ public override int GetHashCode() /// public void SetNonSerializedPeptideInfo(Dictionary idToMod, Dictionary accessionToProtein, DigestionParams dp) { - GetModsAfterDeserialization(idToMod); + _allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod); 
GetProteinAfterDeserialization(accessionToProtein); _digestionParams = dp; } @@ -926,66 +919,6 @@ public void SetNonSerializedPeptideInfo(Dictionary idToMod Dictionary accessionToProtein, IDigestionParams dp) => SetNonSerializedPeptideInfo(idToMod, accessionToProtein, (DigestionParams)dp); - private void GetModsAfterDeserialization(Dictionary idToMod) - { - _allModsOneIsNterminus = new Dictionary(); - int currentModStart = 0; - int currentModificationLocation = 1; - bool currentlyReadingMod = false; - int bracketCount = 0; - - for (int r = 0; r < FullSequence.Length; r++) - { - char c = FullSequence[r]; - if (c == '[') - { - currentlyReadingMod = true; - if (bracketCount == 0) - { - currentModStart = r + 1; - } - bracketCount++; - } - else if (c == ']') - { - string modId = null; - bracketCount--; - if (bracketCount == 0) - { - try - { - //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot") - string modString = FullSequence.Substring(currentModStart, r - currentModStart); - int splitIndex = modString.IndexOf(':'); - string modType = modString.Substring(0, splitIndex); - modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); - } - catch (Exception e) - { - throw new MzLibUtil.MzLibException( - "Error while trying to parse string into peptide: " + e.Message); - } - if (!idToMod.TryGetValue(modId, out Modification mod)) - { - throw new MzLibUtil.MzLibException( - "Could not find modification while reading string: " + FullSequence); - } - if (mod.LocationRestriction.Contains("C-terminal.") && r == FullSequence.Length - 1) - { - currentModificationLocation = BaseSequence.Length + 2; - } - _allModsOneIsNterminus.Add(currentModificationLocation, mod); - currentlyReadingMod = false; - } - } - else if (!currentlyReadingMod) - { - currentModificationLocation++; - } - //else do nothing - } - } - private void GetProteinAfterDeserialization(Dictionary idToProtein) { Protein protein = null; @@ -1140,17 +1073,15 @@ public 
PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true); DigestionParams d = _digestionParams; - // Creates a hash code corresponding to the target's sequence - int targetHash = GetHashCode(); PeptideWithSetModifications decoyPeptide; //Make the "peptideDescription" store the corresponding target's sequence if (newBaseString != this.BaseSequence) { decoyPeptide = new PeptideWithSetModifications(decoyProtein, d, this.OneBasedStartResidueInProtein, this.OneBasedEndResidueInProtein, this.CleavageSpecificityForFdrCategory, this.FullSequence, this.MissedCleavages, newModificationsDictionary, this.NumFixedMods, newBaseString); // Sets PairedTargetDecoyHash of the original target peptie to the hash hode of the decoy sequence - PairedTargetDecoyHash = decoyPeptide.GetHashCode(); + PairedTargetDecoySequence = decoyPeptide.FullSequence; // Sets PairedTargetDecoyHash of the decoy peptide to the hash code of the target sequence - decoyPeptide.PairedTargetDecoyHash = targetHash; + decoyPeptide.PairedTargetDecoySequence = this.FullSequence; return decoyPeptide; } @@ -1158,9 +1089,9 @@ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoA { //The reverse decoy procedure failed to create a PeptideWithSetModificatons with a different sequence. Therefore, //we retrun the mirror image peptide. 
- decoyPeptide = this.GetPeptideMirror(revisedAminoAcidOrder); - PairedTargetDecoyHash = decoyPeptide.GetHashCode(); - decoyPeptide.PairedTargetDecoyHash = targetHash; + decoyPeptide = this.GetPeptideMirror(revisedAminoAcidOrder); + PairedTargetDecoySequence = decoyPeptide.FullSequence; + decoyPeptide.PairedTargetDecoySequence = this.FullSequence; return decoyPeptide; } @@ -1318,17 +1249,15 @@ public PeptideWithSetModifications GetScrambledDecoyFromTarget(int[] revisedAmin Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true); DigestionParams d = _digestionParams; - // Creates a hash code corresponding to the target's sequence - int targetHash = GetHashCode(); PeptideWithSetModifications decoyPeptide; //Make the "peptideDescription" store the corresponding target's sequence if (newBaseString != this.BaseSequence) { decoyPeptide = new PeptideWithSetModifications(decoyProtein, d, this.OneBasedStartResidueInProtein, this.OneBasedEndResidueInProtein, this.CleavageSpecificityForFdrCategory, this.FullSequence, this.MissedCleavages, newModificationsDictionary, this.NumFixedMods, newBaseString); // Sets PairedTargetDecoyHash of the original target peptie to the hash hode of the decoy sequence - PairedTargetDecoyHash = decoyPeptide.GetHashCode(); + PairedTargetDecoySequence = decoyPeptide.FullSequence; // Sets PairedTargetDecoyHash of the decoy peptide to the hash code of the target sequence - decoyPeptide.PairedTargetDecoyHash = targetHash; + decoyPeptide.PairedTargetDecoySequence = this.FullSequence; return decoyPeptide; } @@ -1337,8 +1266,8 @@ public PeptideWithSetModifications GetScrambledDecoyFromTarget(int[] revisedAmin //The reverse decoy procedure failed to create a PeptideWithSetModificatons with a different sequence. Therefore, //we retrun the mirror image peptide. 
decoyPeptide = this.GetPeptideMirror(revisedAminoAcidOrder); - PairedTargetDecoyHash = decoyPeptide.GetHashCode(); - decoyPeptide.PairedTargetDecoyHash = targetHash; + PairedTargetDecoySequence = decoyPeptide.FullSequence; + decoyPeptide.PairedTargetDecoySequence = this.FullSequence; return decoyPeptide; } } diff --git a/mzLib/Proteomics/Proteomics.csproj b/mzLib/Proteomics/Proteomics.csproj index a87d3a3a9..998b66e50 100644 --- a/mzLib/Proteomics/Proteomics.csproj +++ b/mzLib/Proteomics/Proteomics.csproj @@ -1,7 +1,7 @@  - net6.0 + net8.0 x64 @@ -28,4 +28,8 @@ + + + + diff --git a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs new file mode 100644 index 000000000..87100cfa5 --- /dev/null +++ b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableRecord.cs @@ -0,0 +1,55 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers.ExternalResults.BaseClasses +{ + /// + /// Defines the information needed to create the identification object usable by FlashLFQ + /// + public interface IQuantifiableRecord + { + /// + /// The file name of the MS Data file in which the identification was made + /// + public string FileName { get; } + + /// + /// A list of tuples, each of which represent a protein. + /// Each tuple contains the accession number, gene name, and organism associated with the given result. 
+ /// + public List<(string proteinAccessions, string geneName, string organism)> ProteinGroupInfos { get; } + + /// + /// The amino acid sequence of the identified peptide + /// + public string BaseSequence { get; } + + /// + /// The amino acid sequence and the associated post-translation modifications of the identified peptide + /// + public string ModifiedSequence { get; } + + /// + /// The retention time (in minutes) associated with the result + /// + public double RetentionTime { get; } + + /// + /// The charge state associated with the result + /// + public int ChargeState { get; } + + /// + /// Defines whether or not the result is a decoy identification + /// + public bool IsDecoy { get; } + + /// + /// The mass of the monoisotopic peptide (i.e., no c13 or n15 atoms are present, the lowest possible mass) + /// + public double MonoisotopicMass { get; } + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs new file mode 100644 index 000000000..d181e399e --- /dev/null +++ b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers.ExternalResults.BaseClasses +{ + /// + /// Outlines behavior to turn results into an IEnumerable of IQuantifiableRecords + /// and to create the dictionary linking file names from the external result files + /// to their local file paths which are used to make the identification object + /// + public interface IQuantifiableResultFile : IResultFile + { + /// + /// Returns every result in the result file as an IQuantifiableRecord + /// + /// Enumerable that contains identifications for a peptide + public IEnumerable GetQuantifiableResults(); + + /// + /// Links the file name associated with the protein to the raw file path of MassSpec 
data + /// + /// list of file paths associated with each distinct record + /// Dictionary of file names and their associted full paths + public Dictionary FileNameToFilePath(List fullFilePath); + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/CruxResult.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/CruxResult.cs new file mode 100644 index 000000000..f13392918 --- /dev/null +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/CruxResult.cs @@ -0,0 +1,91 @@ +using System.Globalization; +using System.Text; +using CsvHelper.Configuration; +using CsvHelper.Configuration.Attributes; +using MzLibUtil; + +namespace Readers +{ + public class CruxResult + { + public static CsvConfiguration CsvConfiguration => new CsvConfiguration(CultureInfo.InvariantCulture) + { + Encoding = Encoding.UTF8, + HasHeaderRecord = true, + Delimiter = "\t", + }; + + [Name("file")] + public string FilePath { get; set; } + + [Name("scan")] + public int OneBasedScanNumber { get; set; } + + [Name("charge")] + public int Charge { get; set; } + + [Name("retention time")] + public double RetentionTime { get; set; } + + [Name("spectrum precursor m/z")] + public double PrecursorMz { get; set; } + + [Name("spectrum neutral mass")] + public double NeutralMass { get; set; } + + [Name("peptide mass")] + public double PeptideMass { get; set; } + + [Name("delta_cn")] + public double DeltaCn { get; set; } + + [Name("xcorr score")] + public double XCorrScore { get; set; } + + [Name("xcorr rank")] + public int XCorrRank { get; set; } + + [Name("tailor score")] + public double TailorScore { get; set; } + + [Name("tdc q-value")] + public double TdcQValue { get; set; } + + [Name("b/y ions matched")] + public int BAndYIonsMatched { get; set; } + + [Name("b/y ions total")] + public int BAndYIonsTotal { get; set; } + + [Name("b/y ions fraction")] + public double BAndYIonsFraction { get; set; } + + [Name("b/y ion repeat match")] + public int 
BAndYIonRepeatMatch { get; set; } + + [Name("distinct matches/spectrum")] + public int DistinctMatchesPerSpectrum { get; set; } + + [Name("sequence")] + public string FullSequence { get; set; } + + [Name("unmodified sequence")] + public string BaseSequence { get; set; } + + [Name("protein id")] + public string ProteinId { get; set; } + + [Name("flanking aa")] + public string FlankingAa { get; set; } + + #region Interpreted properties + + [Ignore] private string? _fileNameWithoutExtension = null; + [Ignore] public string FileNameWithoutExtension => _fileNameWithoutExtension ??= FilePath.GetPeriodTolerantFilenameWithoutExtension(); + + [Ignore] private string? _accession = null; + [Ignore] public string Accession => _accession ??= ProteinId.Split('|')[1].Trim(); + + #endregion + } +} diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/ExperimentAnnotation.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/ExperimentAnnotation.cs new file mode 100644 index 000000000..93b1192b8 --- /dev/null +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/ExperimentAnnotation.cs @@ -0,0 +1,44 @@ +using CsvHelper.Configuration; +using CsvHelper.Configuration.Attributes; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers +{ + /// + /// A class representing a single entry in an experiment_annotation.tsv file + /// + public class ExperimentAnnotation + { + public static CsvConfiguration CsvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) + { + Delimiter = "\t", + HasHeaderRecord = true, + IgnoreBlankLines = true, + TrimOptions = TrimOptions.Trim + }; + + #region experiment_annotation Fields + + [Name("file")] + public string File { get; set; } + + [Name("sample")] + public string Sample { get; set; } + + [Name("sample_name")] + public string SampleName { get; set; } + + [Name("condition")] + public string Condition { 
get; set; } + + [Name("replicate")] + public string Replicate { get; set; } + + #endregion + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs index 3dd5b1976..54dc19987 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs @@ -14,10 +14,14 @@ using Proteomics; using static System.Net.Mime.MediaTypeNames; using ThermoFisher.CommonCore.Data.Interfaces; +using Readers.ExternalResults.BaseClasses; +using System.Reflection.Metadata.Ecma335; +using System.Runtime.CompilerServices; +using Easy.Common.Extensions; namespace Readers { - public class MsFraggerPsm + public class MsFraggerPsm : IQuantifiableRecord { public static CsvConfiguration CsvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) { @@ -59,7 +63,7 @@ public class MsFraggerPsm [Name("Retention")] public double RetentionTime { get; set; } - + [Name("Observed Mass")] public double ObservedMass { get; set; } @@ -90,7 +94,11 @@ public class MsFraggerPsm [Name("Nextscore")] public double NextScore { get; set; } - [Name("PeptideProphet Probability")] + /// + /// MsFragger v22.0 output renames the header "PeptideProphet Probability" as just "Probability". + /// Headers are mutually exclusive, will not both occur in the same file. 
+ /// + [Name("PeptideProphet Probability", "Probability")] public double PeptideProphetProbability { get; set; } [Name("Number of Enzymatic Termini")] @@ -155,5 +163,78 @@ public class MsFraggerPsm public int OneBasedScanNumber => _oneBasedScanNumber ??= int.Parse(Spectrum.Split('.')[1]); #endregion + + #region IQuantifiableRecord Implementation + + [Ignore] public string FileName => SpectrumFilePath; + + [Ignore] public List<(string, string, string)> ProteinGroupInfos + { + get + { + _proteinGroupInfos ??= AddProteinGroupInfos(); + return _proteinGroupInfos; + } + } + + /// + /// Creates a list of tuples, each of which represents a protein. + /// Each tuple contains the accession number, gene name, and organism. + /// These parameters are used to create a ProteinGroup object, + /// which is needed to make an identification. + /// + /// + private List<(string, string, string)> AddProteinGroupInfos () + { + _proteinGroupInfos = new List<(string, string, string)> (); + string protein = Protein; + + char[] delimiterChars = { '|', '_'}; + string[] proteinInfo = protein.Split(delimiterChars); + + string proteinAccessions; + string geneName; + string organism; + + // Fasta header is parsed to separate the accession number, gene name, and organism. + // If the protein does not have this information, it will be assigned an empty string. + // Ideally, a future refactor would create a method for parsing fasta headers + // that is shared by Readers and UsefulProteomicsDatabases. + proteinAccessions = proteinInfo.Length >= 2 ? proteinInfo[1] : ""; + geneName = proteinInfo.Length >= 3 ? proteinInfo[2] : ""; + organism = proteinInfo.Length >= 4 ? 
proteinInfo[3] : ""; ; + + _proteinGroupInfos.Add((proteinAccessions, geneName, organism)); + + if (MappedProteins.IsNullOrEmpty()) return _proteinGroupInfos; + + string mappedProteins = MappedProteins; + string[] allMappedProteinInfo = mappedProteins.Split(','); + foreach (var singleMappedProteinInfo in allMappedProteinInfo) + { + string[] mappedProteinInfo = singleMappedProteinInfo.Split(delimiterChars); + + proteinAccessions = mappedProteinInfo.Length >= 2 ? mappedProteinInfo[1] : ""; + geneName = mappedProteinInfo.Length >= 3 ? mappedProteinInfo[2] : ""; + organism = mappedProteinInfo.Length >= 4 ? mappedProteinInfo[3] : ""; + + _proteinGroupInfos.Add((proteinAccessions, geneName, organism)); + } + + return _proteinGroupInfos; + } + + [Ignore] private List<(string, string, string)> _proteinGroupInfos; + + [Ignore] public string ModifiedSequence => FullSequence.IsNullOrEmpty() ? BaseSequence : FullSequence; + + [Ignore] public int ChargeState => Charge; + + // decoy reading isn't currently supported for MsFragger psms, this will be revisited later + [Ignore] public bool IsDecoy => false; + + [Ignore] public double MonoisotopicMass => CalculatedPeptideMass; + + #endregion } -} +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsPathFinderTResult.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsPathFinderTResult.cs new file mode 100644 index 000000000..9541babe4 --- /dev/null +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsPathFinderTResult.cs @@ -0,0 +1,101 @@ +using CsvHelper.Configuration.Attributes; +using CsvHelper.Configuration; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Chemistry; + +namespace Readers +{ + public class MsPathFinderTResult + { + public static CsvConfiguration CsvConfiguration { get; } = new CsvConfiguration(System.Globalization.CultureInfo.InvariantCulture) + { + Delimiter = "\t", + 
HasHeaderRecord = true, + IgnoreBlankLines = true, + TrimOptions = CsvHelper.Configuration.TrimOptions.InsideQuotes, + BadDataFound = null, + }; + + + [Name("Scan")] + public int OneBasedScanNumber { get; set; } + + [Name("Pre")] + public char PreviousResidue { get; set; } + + [Name("Sequence")] + public string BaseSequence { get; set; } + + [Name("Post")] + public char NextResidue { get; set; } + + [Name("Modifications")] + public string Modifications { get; set; } + + [Name("Composition")] + [TypeConverter(typeof(MsPathFinderTCompositionToChemicalFormulaConverter))] + public ChemicalFormula ChemicalFormula { get; set; } + + [Name("ProteinName")] + public string ProteinName { get; set; } + + [Name("ProteinDesc")] + public string ProteinDescription { get; set; } + + [Name("ProteinLength")] + public int Length { get; set; } + + [Name("Start")] + public int OneBasedStartResidue { get; set; } + + [Name("End")] + public int OneBasedEndResidue { get; set; } + + [Name("Charge")] + public int Charge { get; set; } + + [Name("MostAbundantIsotopeMz")] + public double MostAbundantIsotopeMz { get; set; } + + [Name("Mass")] + public double MonoisotopicMass { get; set; } + + [Name("Ms1Features")] + public int Ms1Features { get; set; } + + [Name("#MatchedFragments")] + public int NumberOfMatchedFragments { get; set; } + + [Name("Probability")] + public double Probability { get; set; } + + [Name("SpecEValue")] + public double SpecEValue { get; set; } + + [Name("EValue")] + public double EValue { get; set; } + + [Name("QValue")] + [Optional] + public double QValue { get; set; } + + [Name("PepQValue")] + [Optional] + public double PepQValue { get; set; } + + #region InterpretedFields + + [Ignore] private string _accession = null; + [Ignore] public string Accession => _accession ??= ProteinName.Split('|')[1].Trim(); + + [Ignore] private bool? 
_isDecoy = null; + [Ignore] public bool IsDecoy => _isDecoy ??= ProteinName.StartsWith("XXX"); + [Optional] public string FileNameWithoutExtension { get; set; } + + #endregion + } +} diff --git a/mzLib/Readers/ExternalResults/ResultFiles/CruxResultFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/CruxResultFile.cs new file mode 100644 index 000000000..dff6143c4 --- /dev/null +++ b/mzLib/Readers/ExternalResults/ResultFiles/CruxResultFile.cs @@ -0,0 +1,35 @@ + +namespace Readers +{ + public class CruxResultFile : ResultFile, IResultFile + { + public override SupportedFileType FileType => SupportedFileType.CruxResult; + public override Software Software { get; set; } + + public CruxResultFile(string filePath) : base(filePath, Software.Crux) { } + + public CruxResultFile() : base() { } + + public override void LoadResults() + { + using var csv = new CsvHelper.CsvReader(new StreamReader(FilePath), CruxResult.CsvConfiguration); + Results = csv.GetRecords().ToList(); + } + + public override void WriteResults(string outputPath) + { + if (!CanRead(FilePath)) + outputPath += FileType.GetFileExtension(); + + using (var csv = new CsvHelper.CsvWriter(new StreamWriter(File.Create(outputPath)), CruxResult.CsvConfiguration)) + { + csv.WriteHeader(); + foreach (var result in Results) + { + csv.NextRecord(); + csv.WriteRecord(result); + } + } + } + } +} diff --git a/mzLib/Readers/ExternalResults/ResultFiles/ExperimentAnnotationFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/ExperimentAnnotationFile.cs new file mode 100644 index 000000000..9a975b84d --- /dev/null +++ b/mzLib/Readers/ExternalResults/ResultFiles/ExperimentAnnotationFile.cs @@ -0,0 +1,54 @@ +using CsvHelper; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers +{ + /// + /// Concrete Product for reading and representing a experiment annotation file + /// + public class ExperimentAnnotationFile: ResultFile, IResultFile + { + 
public override SupportedFileType FileType => SupportedFileType.ExperimentAnnotation; + + public override Software Software { get; set; } + + public ExperimentAnnotationFile(string filePath) : base(filePath, Software.MsFragger) { } + + /// + /// Constructor used to initialize from the factory method + /// + public ExperimentAnnotationFile() : base() { } + + /// + /// Load Results to the Results List from the given filepath + /// + public override void LoadResults() + { + using var csv = new CsvReader(new StreamReader(FilePath), ExperimentAnnotation.CsvConfiguration); + Results = csv.GetRecords().ToList(); + } + + /// + /// Writes results to a specific output path + /// + /// destination path + public override void WriteResults(string outputPath) + { + if (!CanRead(outputPath)) + outputPath += FileType.GetFileExtension(); + + using var csv = new CsvWriter(new StreamWriter(File.Create(outputPath)), ExperimentAnnotation.CsvConfiguration); + + csv.WriteHeader(); + foreach (var result in Results) + { + csv.NextRecord(); + csv.WriteRecord(result); + } + } + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerCombinedResults.cs b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerCombinedResults.cs new file mode 100644 index 000000000..db00ed7b0 --- /dev/null +++ b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerCombinedResults.cs @@ -0,0 +1,171 @@ +using CsvHelper; +using Readers.ExternalResults.BaseClasses; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using System.IO; +using MathNet.Numerics; + +namespace Readers +{ + public class MsFraggerCombinedResults : ResultFile, IResultFile, IQuantifiableResultFile + { + #region Properties/Fields + + public string FullFolderPath => FilePath; // The full file path to the folder of MSFragger results + private List allPsmFilePaths; // List of the full file paths to the psm files of every sample + + // A list 
of all the MSFraggerPsmFile objects that correspond to each sample within an experiment + public List AllPsmFiles { get; private set; } + + // Contains descriptive information on every ms data file in the experiment (sample name, full path to the ms data file, etc.) + public ExperimentAnnotationFile ExperimentAnnotations { get; private set; } + + #endregion + + #region IResultFile Implementatation + + public override SupportedFileType FileType => SupportedFileType.MsFraggerPsm; + public override Software Software { get; set; } + public MsFraggerCombinedResults(string filePath) : base(filePath, Software.MsFragger) { } + + /// + /// Loads the results from each psm.tsv file in the results folder, builds one list of MsFraggerPsms, + /// and Calls LoadExperimentAnnotation, FindAllFilePaths, LoadPsmResults, + /// then selects every results from each MsFraggerPsmFile in AllPsmFiles and writes them to one concatenated list. + /// + public override void LoadResults() + { + LoadExperimentAnnotationResults(); + FindAllFilePaths(); + LoadPsmResults(); + + List concatList = new List(); + foreach (var file in AllPsmFiles) + { + concatList.AddRange(file); + } + + Results = concatList; + } + + public override void WriteResults(string outputPath) + { + throw new NotImplementedException("Method not yet implemented."); + } + + #endregion + + /// + /// Checks for existence of experiment annotation file and loads its it as an ExperimentAnnotationResultFile, + /// then sets the ExperimentAnnotations property + /// + /// + public void LoadExperimentAnnotationResults() + { + string combinedFilePath = Path.Combine(FullFolderPath, "experiment_annotation.tsv"); + if (!File.Exists(combinedFilePath)) { throw new FileNotFoundException("The experiment_annotation.tsv file was not found"); } + + ExperimentAnnotations = new ExperimentAnnotationFile(combinedFilePath); + } + + /// + /// For each path in AllPsmFilePaths, creates and loads an MsFraggerPsmFile. 
+ /// Then constructs the AllPsmFiles list + /// + public void LoadPsmResults() + { + AllPsmFiles = new List(); + + foreach(var path in allPsmFilePaths) + { + MsFraggerPsmFile file = new MsFraggerPsmFile(path); + AllPsmFiles.Add(file); + } + } + + public IEnumerable GetQuantifiableResults() => Results; + + /// + /// Links the file name associated with the an IQuantifiableRecord + /// to the raw file path of MassSpec data in the fullFilePath list + /// + /// list of file paths associated with each distinct record + /// Dictionary of file names and their associted full paths + public Dictionary FileNameToFilePath(List filePaths) + { + Dictionary allFiles = new Dictionary(); + + allFiles = AllPsmFiles.Select(file => file.FileNameToFilePath(filePaths)) + .SelectMany(dictionary => dictionary) + .GroupBy(x => x.Key) + .Select(keyValuePair => keyValuePair.First()) + .ToDictionary(fileName => fileName.Key, filePath => filePath.Value); + + return allFiles; + } + + /// + /// Links the file name associated with IQuantifiableRecord to the raw file path pf MassSpec file + /// using the full file paths from the experiment annotation file. 
+ /// + /// Dictionary of file names and their associted full paths + public Dictionary FileNameToFilePath() + { + List filePaths = ExperimentAnnotations.Select(psm => psm.File).Distinct().ToList(); + List fileNames = Results.Select(psm => psm.FileName).Distinct().ToList(); + Dictionary allFiles = new Dictionary(); + + foreach (var name in fileNames) + { + string fileName = Path.GetFileName(name); + + // MSFragger results append the raw file with "interact-" and replace .raw with .pep.xml + // In order to correctly match the file names, these changes must be removed + fileName = fileName.Replace("interact-", "").Replace(".pep.xml", ""); + + foreach (var path in filePaths) + { + if (path.Contains(fileName) && !allFiles.ContainsKey(name)) + { + allFiles.Add(name, path); + break; + } + } + } + + return allFiles; + } + + /// + /// Uses the ExperimentAnnotations to locate each psm.tsv file in the results folder. + /// Adds the path to each psm.tsv file in the results folder to AllPsmFilePaths + /// + /// + private void FindAllFilePaths() + { + allPsmFilePaths = new List(); + + List sampleNames = ExperimentAnnotations.Select(psm => psm.SampleName).Distinct().ToList(); + string[] directoryEntries = Directory.GetDirectories(FullFolderPath); + + foreach (var directoryEntry in directoryEntries) + { + string directoryName = Path.GetFileName(directoryEntry.TrimEnd(Path.DirectorySeparatorChar)); + + foreach (var sample in sampleNames) + { + if (directoryName.Equals(sample)) + { + string psmFile = Path.Combine(directoryEntry, "psm.tsv"); + if (!File.Exists(psmFile)) { throw new FileNotFoundException("This psm.tsv file was not found"); } + + allPsmFilePaths.Add(psmFile); + } + } + } + } + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs index 165d1c8d3..f15a2d909 100644 --- a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs +++ 
b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs @@ -4,10 +4,12 @@ using System.Text; using System.Threading.Tasks; using CsvHelper; +using MassSpectrometry; +using Readers.ExternalResults.BaseClasses; namespace Readers { - public class MsFraggerPsmFile : ResultFile, IResultFile + public class MsFraggerPsmFile : ResultFile, IQuantifiableResultFile { public override SupportedFileType FileType => SupportedFileType.MsFraggerPsm; public override Software Software { get; set; } @@ -38,5 +40,39 @@ public override void WriteResults(string outputPath) csv.WriteRecord(result); } } + + public IEnumerable GetQuantifiableResults() => Results; + + /// + /// Creates a dictionary linking a shortened file name to its corresponding full file path + /// + /// list of all full file paths associted with a given result + /// dictionary with key fileName and value fullFilePath + public Dictionary FileNameToFilePath (List fullFilePath) + { + List rawFileNames = Results.Select(psm => psm.FileName).Distinct().ToList(); + fullFilePath = fullFilePath.Distinct().ToList(); + Dictionary allFiles = new Dictionary(); + + foreach(var fileName in rawFileNames) + { + string shortFileName = Path.GetFileName(fileName); + + // MSFragger results append the raw file with "interact-" and replace .raw with .pep.xml + // In order to correctly match the file names, these changes must be removed + shortFileName = shortFileName.Replace("interact-", "").Replace(".pep.xml", ""); + + foreach(var file in fullFilePath) + { + if (file.Contains(shortFileName) && !allFiles.ContainsKey(fileName)) + { + allFiles.Add(fileName, file); + break; + } + } + } + + return allFiles; + } } -} +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/ResultFiles/MsPathFinderTResultFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/MsPathFinderTResultFile.cs new file mode 100644 index 000000000..5431c1b46 --- /dev/null +++ b/mzLib/Readers/ExternalResults/ResultFiles/MsPathFinderTResultFile.cs @@ -0,0 
+1,50 @@ +using CsvHelper; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Easy.Common.Extensions; + +namespace Readers +{ + public class MsPathFinderTResultFile : ResultFile, IResultFile + { + public override SupportedFileType FileType { get; } + public override Software Software { get; set; } + + public MsPathFinderTResultFile(string filePath) : base(filePath, Software.MsPathFinderT) + { + FileType = filePath.ParseFileType(); + } + + public MsPathFinderTResultFile() : base() + { + FileType = FilePath.IsNullOrEmpty() ? SupportedFileType.MsPathFinderTAllResults : FilePath.ParseFileType(); + } + + public override void LoadResults() + { + using var csv = new CsvReader(new StreamReader(FilePath), MsPathFinderTResult.CsvConfiguration); + Results = csv.GetRecords().ToList(); + if (Results.Any() && Results.First().FileNameWithoutExtension.IsNullOrEmpty()) + Results.ForEach(p => p.FileNameWithoutExtension = string.Join("_", Path.GetFileNameWithoutExtension(FilePath).Split('_')[..^1])); + } + + public override void WriteResults(string outputPath) + { + if (!CanRead(outputPath)) + outputPath += FileType.GetFileExtension(); + + using (var csv = new CsvWriter(new StreamWriter(File.Create(outputPath)), MsPathFinderTResult.CsvConfiguration)) + { + csv.WriteHeader(); + foreach (var result in Results) + { + csv.NextRecord(); + csv.WriteRecord(result); + } + } + } + } +} diff --git a/mzLib/Readers/QuantificationResults/QuantifiedPeak.cs b/mzLib/Readers/QuantificationResults/QuantifiedPeak.cs new file mode 100644 index 000000000..6a543488d --- /dev/null +++ b/mzLib/Readers/QuantificationResults/QuantifiedPeak.cs @@ -0,0 +1,94 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using CsvHelper.Configuration; +using CsvHelper.Configuration.Attributes; + +namespace Readers.QuantificationResults +{ + public class QuantifiedPeak + { + 
public static CsvConfiguration CsvConfiguration = new CsvConfiguration(System.Globalization.CultureInfo.InvariantCulture) + { + Delimiter = "\t", + HasHeaderRecord = true, + IgnoreBlankLines = true, + TrimOptions = CsvHelper.Configuration.TrimOptions.Trim, + }; + + [Name("File Name")] + public string FileName { get; set; } + + [Name("Base Sequence")] + public string BaseSequence { get; set; } + + [Name("Full Sequence")] + public string FullSequence { get; set; } + + [Name("Protein Group")] + public string ProteinGroup { get; set; } + + [Name("Peptide Monoisotopic Mass")] + public double PeptideMonoisotopicMass { get; set; } + + [Name("MS2 Retention Time")] + public double? MS2RetentionTime { get; set; } + + [Name("Precursor Charge")] + public int PrecursorCharge { get; set; } + + [Name("Theoretical MZ")] + public double TheoreticalMZ { get; set; } + + [Name("Peak intensity")] + public double PeakIntensity { get; set;} + + [Name("Peak RT Start")] + [TypeConverter(typeof(DashToNullOrDoubleConverter))] + public double? PeakRTStart { get; set; } + + [Name("Peak RT Apex")] + [TypeConverter(typeof(DashToNullOrDoubleConverter))] + public double? PeakRTApex { get; set; } + + [Name("Peak RT End")] + [TypeConverter(typeof(DashToNullOrDoubleConverter))] + public double? PeakRTEnd { get; set; } + + [Name("Peak MZ")] + [TypeConverter(typeof(DashToNullOrDoubleConverter))] + public double? PeakMz { get; set; } + + [Name("Peak Charge")] + [TypeConverter(typeof(DashToNullOrIntegerConverter))] + public int? 
/// <summary>
/// Result file holding <see cref="QuantifiedPeak"/> records, read and written with
/// <see cref="QuantifiedPeak.CsvConfiguration"/>.
/// </summary>
public class QuantifiedPeakFile : ResultFile<QuantifiedPeak>, IResultFile
{
    public override SupportedFileType FileType => SupportedFileType.FlashLFQQuantifiedPeak;
    public override Software Software { get; set; }

    public QuantifiedPeakFile(string filePath) : base(filePath, Software.FlashLFQ) { }

    /// <summary>
    /// Constructor used to initialize from the factory method
    /// </summary>
    public QuantifiedPeakFile() : base() { }

    /// <summary>
    /// Reads every record from <c>FilePath</c> into <c>Results</c>.
    /// </summary>
    public override void LoadResults()
    {
        using (var reader = new CsvReader(new StreamReader(FilePath), QuantifiedPeak.CsvConfiguration))
        {
            Results = reader.GetRecords<QuantifiedPeak>().ToList();
        }
    }

    /// <summary>
    /// Writes the header and all records to <paramref name="outputPath"/>, appending the
    /// file type's extension when <c>CanRead</c> rejects the path as given.
    /// </summary>
    public override void WriteResults(string outputPath)
    {
        bool hasRecognizedName = CanRead(outputPath);
        if (!hasRecognizedName)
        {
            outputPath += FileType.GetFileExtension();
        }

        using (var writer = new CsvWriter(new StreamWriter(File.Create(outputPath)), QuantifiedPeak.CsvConfiguration))
        {
            writer.WriteHeader<QuantifiedPeak>();
            foreach (var peak in Results)
            {
                writer.NextRecord();
                writer.WriteRecord(peak);
            }
        }
    }
}
result : 0; + } + + public override string ConvertToString(object value, IWriterRow row, MemberMapData memberMapData) + { + return value as int? == null ? "-" : value.ToString(); + } + } + public class CommaDelimitedToIntegerArrayTypeConverter : DefaultTypeConverter { public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData) @@ -53,7 +71,7 @@ public override string ConvertToString(object value, IWriterRow row, MemberMapDa } } - public class CommaDelimitedToStringArrayTypeConverter : DefaultTypeConverter + internal class CommaDelimitedToStringArrayTypeConverter : DefaultTypeConverter { public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData) { @@ -69,4 +87,65 @@ public override string ConvertToString(object value, IWriterRow row, MemberMapDa return string.Join(',', list); } } + + /// + /// Converts the chemical formula from MsPathFinderT to MetaMorpheus + /// MsPathFinderT: "C(460) H(740) N(136) O(146) S(0)" + /// MetaMorpheus: "C460H740N136O146S" + /// + internal class MsPathFinderTCompositionToChemicalFormulaConverter : DefaultTypeConverter + { + public override object ConvertFromString(string text, IReaderRow row, MemberMapData memberMapData) + { + var composition = text.Split(' ').Where(p => p != "").ToArray(); + var chemicalFormula = new Chemistry.ChemicalFormula(); + foreach (var element in composition) + { + var elementSplit = element.Split('('); + var elementName = elementSplit[0]; + var elementCount = int.Parse(elementSplit[1].Replace(")", "")); + chemicalFormula.Add(elementName, elementCount); + } + return chemicalFormula; + } + + public override string ConvertToString(object value, IWriterRow row, MemberMapData memberMapData) + { + var chemicalFormula = value as Chemistry.ChemicalFormula ?? 
throw new Exception("Cannot convert input to ChemicalFormula"); + var sb = new StringBuilder(); + + bool onNumber = false; + foreach (var character in chemicalFormula.Formula) + { + if (!char.IsDigit(character)) // if is a letter + { + if (onNumber) + { + sb.Append(") " + character); + onNumber = false; + } + else + sb.Append(character); + } + else + { + if (!onNumber) + { + sb.Append("(" + character); + onNumber = true; + } + else + sb.Append(character); + } + } + + var stringForm = sb.ToString(); + if (char.IsDigit(stringForm.Last())) + stringForm += ")"; + else + stringForm += "(1)"; + + return stringForm; + } + } } diff --git a/mzLib/Readers/Util/Software.cs b/mzLib/Readers/Util/Software.cs index 1cacfe60c..f3e20bb74 100644 --- a/mzLib/Readers/Util/Software.cs +++ b/mzLib/Readers/Util/Software.cs @@ -1,5 +1,4 @@ - -namespace Readers +namespace Readers { public enum Software { @@ -11,5 +10,8 @@ public enum Software MaxQuant, Toppic, MsFragger, // files tested were from fragpipe v21.1 + FlashLFQ, + MsPathFinderT, + Crux } } diff --git a/mzLib/Readers/Util/SupportedFileTypes.cs b/mzLib/Readers/Util/SupportedFileTypes.cs index 008662759..371caf455 100644 --- a/mzLib/Readers/Util/SupportedFileTypes.cs +++ b/mzLib/Readers/Util/SupportedFileTypes.cs @@ -22,6 +22,12 @@ public enum SupportedFileType MsFraggerPsm, MsFraggerPeptide, MsFraggerProtein, + FlashLFQQuantifiedPeak, + MsPathFinderTTargets, + MsPathFinderTDecoys, + MsPathFinderTAllResults, + CruxResult, + ExperimentAnnotation } public static class SupportedFileTypeExtensions @@ -54,10 +60,15 @@ public static string GetFileExtension(this SupportedFileType type) SupportedFileType.MsFraggerPsm => "psm.tsv", SupportedFileType.MsFraggerPeptide => "peptide.tsv", SupportedFileType.MsFraggerProtein => "protein.tsv", + SupportedFileType.FlashLFQQuantifiedPeak => "Peaks.tsv", + SupportedFileType.MsPathFinderTTargets => "_IcTarget.tsv", + SupportedFileType.MsPathFinderTDecoys => "_IcDecoy.tsv", + 
SupportedFileType.MsPathFinderTAllResults => "_IcTDA.tsv", + SupportedFileType.CruxResult => ".txt", + SupportedFileType.ExperimentAnnotation => "experiment_annotation.tsv", _ => throw new MzLibException("File type not supported") }; } - public static SupportedFileType ParseFileType(this string filePath) { switch (Path.GetExtension(filePath).ToLower()) @@ -99,6 +110,16 @@ public static SupportedFileType ParseFileType(this string filePath) return SupportedFileType.MsFraggerPeptide; if (filePath.EndsWith(SupportedFileType.MsFraggerProtein.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) return SupportedFileType.MsFraggerProtein; + if (filePath.EndsWith(SupportedFileType.FlashLFQQuantifiedPeak.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) + return SupportedFileType.FlashLFQQuantifiedPeak; + if (filePath.EndsWith(SupportedFileType.MsPathFinderTTargets.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) + return SupportedFileType.MsPathFinderTTargets; + if (filePath.EndsWith(SupportedFileType.MsPathFinderTDecoys.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) + return SupportedFileType.MsPathFinderTDecoys; + if (filePath.EndsWith(SupportedFileType.MsPathFinderTAllResults.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) + return SupportedFileType.MsPathFinderTAllResults; + if(filePath.EndsWith(SupportedFileType.ExperimentAnnotation.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) + return SupportedFileType.ExperimentAnnotation; // these tsv cases are just .tsv and need an extra step to determine the type // currently need to distinguish between FlashDeconvTsv and MsFraggerPsm @@ -111,6 +132,11 @@ public static SupportedFileType ParseFileType(this string filePath) throw new MzLibException("Tsv file type not supported"); } + case ".txt": + if (filePath.EndsWith(SupportedFileType.CruxResult.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) + return 
SupportedFileType.CruxResult; + throw new MzLibException("Txt file type not supported"); + default: throw new MzLibException("File type not supported"); } diff --git a/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs b/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs index 28a573ac8..35839d1a5 100644 --- a/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs +++ b/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs @@ -54,7 +54,7 @@ private static MsDataScan[] AverageAll(IReadOnlyCollection scans, Sp // create output MsDataScan averagedScan = new(averagedSpectrum, 1, representativeScan.OneBasedScanNumber, representativeScan.IsCentroid, representativeScan.Polarity, scans.Select(p => p.RetentionTime).Average(), - averagedSpectrum.Range, null, representativeScan.MzAnalyzer, scans.Select(p => p.TotalIonCurrent).Average(), + averagedSpectrum.Range, representativeScan.ScanFilter, representativeScan.MzAnalyzer, scans.Select(p => p.TotalIonCurrent).Average(), representativeScan.InjectionTime, null, representativeScan.NativeId); MsDataScan[] msDataScans = { averagedScan }; return msDataScans; @@ -129,7 +129,8 @@ private static MsDataScan GetAveragedDataScanFromAveragedSpectrum(MzSpectrum ave centralScan.IsCentroid, centralScan.Polarity, centralScan.RetentionTime, - averagedSpectrum.Range, null, + averagedSpectrum.Range, + centralScan.ScanFilter, centralScan.MzAnalyzer, averagedSpectrum.SumOfAllY, centralScan.InjectionTime, diff --git a/mzLib/SpectralAveraging/SpectralAveraging.csproj b/mzLib/SpectralAveraging/SpectralAveraging.csproj index 96bc3a953..a946e80d1 100644 --- a/mzLib/SpectralAveraging/SpectralAveraging.csproj +++ b/mzLib/SpectralAveraging/SpectralAveraging.csproj @@ -1,6 +1,6 @@  - net6.0 + net8.0 x64 @@ -15,6 +15,7 @@ + diff --git a/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs b/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs index 3fb4c8428..f46db019d 100644 --- 
a/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs +++ b/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs @@ -1,15 +1,10 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; +using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.IO; using System.Linq; -using System.Reflection; -using System.Text; using MassSpectrometry; -using MzLibUtil; using NUnit.Framework; -using Proteomics.ProteolyticDigestion; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Readers; using SpectralAveraging; diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs index fe3e70c06..8925661cb 100644 --- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs +++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs @@ -1,4 +1,4 @@ -// opyright 2016 Stefan Solntsev +// Copyright 2016 Stefan Solntsev // // This file (ChemicalFormula.cs) is part of Chemistry Library. // @@ -19,6 +19,7 @@ using MassSpectrometry; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics; using System; using System.Collections.Generic; @@ -27,6 +28,7 @@ using Omics.Modifications; using UsefulProteomicsDatabases; using Stopwatch = System.Diagnostics.Stopwatch; +using NUnit.Framework.Legacy; namespace Test.DatabaseTests { @@ -80,6 +82,39 @@ public static void LoadIsoforms() Assert.AreEqual("Q14103-4", proteinXml[9].Accession); } + [Test] + [TestCase("cRAP_databaseGPTMD.xml", DecoyType.None)] + [TestCase("uniprot_aifm1.fasta", DecoyType.None)] + [TestCase("cRAP_databaseGPTMD.xml", DecoyType.Reverse)] + [TestCase("uniprot_aifm1.fasta", DecoyType.Reverse)] + public void LoadingIsReproducible(string fileName, DecoyType decoyType) + { + // Load in proteins + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName); + List proteins1 = null; + List proteins2 = null; + if(fileName.Contains(".xml")) + { 
+ proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications); + } + else if (fileName.Contains(".fasta")) + { + proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications); + } + else + { + Assert.Fail("Unknown file type"); + } + + // check are equivalent lists of proteins + Assert.AreEqual(proteins1.Count, proteins2.Count); + // Because decoys are written in a parallel environment, there is no guarantee that the orders will be the same + CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession)); + CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence)); + } + [Test] public static void LoadModWithNl() { diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs index 7c5affb4e..76915eb9a 100644 --- a/mzLib/Test/DatabaseTests/TestProteinReader.cs +++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs @@ -21,6 +21,7 @@ using System.IO; using System.Linq; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Omics.Modifications; using Proteomics; using UsefulProteomicsDatabases; diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs index c48034df4..d0212b6de 100644 --- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs +++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs @@ -4,6 +4,7 @@ using System.Linq; using MassSpectrometry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Omics.Fragmentation; using Omics.Modifications; using Proteomics; diff --git 
a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index a16232ab0..5d8f551ab 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -3,6 +3,7 @@ using System.IO; using System.Linq; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Omics.Modifications; using Proteomics; using Proteomics.ProteolyticDigestion; diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/protein.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/protein.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/psm.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/psm.tsv new file mode 100644 index 000000000..3b205c248 --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/psm.tsv @@ -0,0 +1,5 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00906.00906.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml VKEDPDGEHAR SISGRPIK.VKEDPDGEHAR.RAMQKVMA K R 11 3 2111.248 1251.5845 1251.5914 418.2021 418.2044 1251.5842 418.202 0.0072 0.05469976 15.518 11.386 0.8908 2 1 144 154 208463.97 0 FALSE sp|P52272|HNRPM_HUMAN P52272 HNRPM_HUMAN HNRNPM 
Heterogeneous nuclear ribonucleoprotein M "tr|M0QYQ7|M0QYQ7_HUMAN, tr|M0R019|M0R019_HUMAN, tr|M0R0N3|M0R0N3_HUMAN, tr|M0R2T0|M0R2T0_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00917.00917.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml NEEDEGHSNSSPR GAKIDASK.NEEDEGHSNSSPR.HSEAATAQ K H 13 3 2113.596 1456.5808 1456.5822 486.5342 486.5347 1456.5814 486.5344 0.0007 0.007893147 17.911 0 1 2 0 73 85 349264.44 0 FALSE sp|Q14103|HNRPD_HUMAN Q14103 HNRPD_HUMAN HNRNPD Heterogeneous nuclear ribonucleoprotein D0 "tr|A0A994J4B1|A0A994J4B1_HUMAN, tr|A0A994J4R1|A0A994J4R1_HUMAN, tr|D6RAF8|D6RAF8_HUMAN, tr|D6RD83|D6RD83_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00947.00947.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml VGQADDSTKPTNK IGSFSGIR.VGQADDSTKPTNK.ASSTSITS R A 13 3 2120.3625 1359.6602 1359.6622 454.2273 454.228 1359.663 454.2283 -0.0007 0.001409289 12.904 0 0.9994 2 1 1339 1351 171548.62 0 FALSE sp|P35658|NU214_HUMAN P35658 NU214_HUMAN NUP214 Nuclear pore complex protein Nup214 "tr|A0A494C1F2|A0A494C1F2_HUMAN, tr|A0A8Q3SHZ4|A0A8Q3SHZ4_HUMAN, tr|B7ZAV2|B7ZAV2_HUMAN, tr|E9PKD2|E9PKD2_HUMAN, tr|H0Y837|H0Y837_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.01021.01021.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml AEQEAEEPRK IAERARIK.AEQEAEEPRK.THSEEFTN K T 10 3 2136.586 1185.5615 1185.5641 396.1944 396.1953 1185.5625 396.1948 0.0015 0.151182 10.782 0 0.9548 2 1 106 115 125972.164 0 FALSE sp|Q9H788|SH24A_HUMAN Q9H788 SH24A_HUMAN SH2D4A SH2 domain-containing protein 4A tr|H0YAT1|H0YAT1_HUMAN diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/protein.tsv 
b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/protein.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/psm.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/psm.tsv new file mode 100644 index 000000000..f010cee8d --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/psm.tsv @@ -0,0 +1,5 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01005.01005.2 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml HAVSEGTK IIPGEIAK.HAVSEGTK.AVTKYTSA K A 8 2 1938.0153 827.4154 827.4152 414.715 414.7149 827.4137 414.7141 0.0015 1.57772E-05 21.993 14.41 0.9994 2 0 110 117 8907404 0 FALSE sp|O60814|H2B1K_HUMAN O60814 H2B1K_HUMAN H2BC12 Histone H2B type 1-K "H2BC1, H2BC11, H2BC12L, H2BC13, H2BC14, H2BC15, H2BC17, H2BC18, H2BC21, H2BC26, H2BC3, H2BC4, H2BC5, H2BC9, H2BK1" "sp|A0A2R8Y619|H2BK1_HUMAN, sp|P06899|H2B1J_HUMAN, sp|P23527|H2B1O_HUMAN, sp|P33778|H2B1B_HUMAN, sp|P57053|H2BFS_HUMAN, sp|P58876|H2B1D_HUMAN, sp|P62807|H2B1C_HUMAN, sp|Q16778|H2B2E_HUMAN, sp|Q5QNW6|H2B2F_HUMAN, sp|Q8N257|H2B3B_HUMAN, sp|Q93079|H2B1H_HUMAN, sp|Q96A08|H2B1A_HUMAN, sp|Q99877|H2B1N_HUMAN, sp|Q99879|H2B1M_HUMAN, sp|Q99880|H2B1L_HUMAN, tr|U3KQK0|U3KQK0_HUMAN" 
+Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01551.01551.2 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml YDSTHGR DYAAYMFK.YDSTHGR.YAGEVSHD K Y 7 2 1976.3535 834.3639 834.3626 418.1892 418.1886 834.362 418.1883 0.0005 0.01685582 15.593 0 0.9997 2 0 47 53 1.96E+07 0 FALSE sp|P00359|G3P3_YEAST P00359 G3P3_YEAST TDH3 Glyceraldehyde-3-phosphate dehydrogenase 3 "GAPDHS, TDH1, TDH2" "sp|O14556|G3PT_HUMAN, sp|P00358|G3P2_YEAST, sp|P00360|G3P1_YEAST, tr|K7EP73|K7EP73_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01565.01565.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml AESSQTCHSEQGDK AESSQTCHSEQGDK KSTQNSFR.AESSQTCHSEQGDK.KMEEKNSG R K 14 3 1977.164 1562.6292 1562.6265 521.8837 521.8828 1562.6267 521.8828 -0.0002 8.61986E-05 27.758 10.626 1 2 0 600 613 4579535.5 7C(57.0215) 0 TRUE sp|P46063|RECQ1_HUMAN P46063 RECQ1_HUMAN RECQL ATP-dependent DNA helicase Q1 +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01607.01607.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml VTSTGRPGHASR ERSPWWVR.VTSTGRPGHASR.FMEDTAAE R F 12 3 1979.2815 1224.6323 1224.6318 409.218 409.2179 1224.6322 409.218 -0.0003 0.9328038 10.648 0 0.9687 2 1 198 209 1587950.4 0 FALSE sp|Q03154|ACY1_HUMAN Q03154 ACY1_HUMAN ACY1 Aminoacylase-1 ABHD14A-ACY1 "tr|A0A1B0GU86|A0A1B0GU86_HUMAN, tr|A0A1B0GV31|A0A1B0GV31_HUMAN, tr|A0A1B0GVA5|A0A1B0GVA5_HUMAN, tr|A0A1B0GW23|A0A1B0GW23_HUMAN, tr|C9JMV9|C9JMV9_HUMAN, tr|C9JYZ0|C9JYZ0_HUMAN" diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/combined_ion.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/combined_ion.tsv new file mode 100644 index 000000000..e69de29bb diff --git 
a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/copy_experiment_annotation.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/copy_experiment_annotation.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/experiment_annotation.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/experiment_annotation.tsv new file mode 100644 index 000000000..3421dff60 --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/experiment_annotation.tsv @@ -0,0 +1,3 @@ +file sample sample_name condition replicate +E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.raw A_1 A_1 A 1 +E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.raw A_2 A_2 A 2 diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/FlashLFQ_MzLib1.0.549_QuantifiedPeaks.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/FlashLFQ_MzLib1.0.549_QuantifiedPeaks.tsv new file mode 100644 index 000000000..abd0e4f07 --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/FlashLFQ_MzLib1.0.549_QuantifiedPeaks.tsv @@ -0,0 +1,5 @@ +File Name Base Sequence Full Sequence Protein Group Peptide Monoisotopic Mass MS2 Retention Time Precursor Charge Theoretical MZ Peak intensity Peak RT Start Peak RT Apex Peak RT End Peak MZ Peak Charge Num Charge States Observed Peak Detection Type MBR Score PSMs Mapped Base Sequences Mapped Full Sequences Mapped Peak Split Valley RT Peak Apex Mass Error (ppm) +20100721_Velos1_TaGe_SA_A549_06-calib-averaged DICNDVLSLLEK DIC[Common Fixed:Carbamidomethyl on C]NDVLSLLEK P63104 1417.712281191 188.04623 3 473.578036863879 61339740.974156484 187.71813 188.27129333333335 195.134625 709.8649279277056 2 3 MSMS 2 1 1 0 2.1314131880687888 +20100721_Velos1_TaGe_SA_A549_06-calib-averaged LQLETEIEALKEELLFMK LQLETEIEALKEELLFM[Common 
Variable:Oxidation on M]K P05783 2192.165023412 201.15724833333334 4 549.0485323198791 60906044.57712504 195.18546166666667 198.654625 239.99995166666665 732.0634256010886 3.4 3 MSMS 28 1 1 195.134625 0.20472510927596868 +20100721_Velos1_TaGe_SA_A549_06-calib-averaged IFVEESIYDEFVR IFVEESIYDEFVR P00352 1644.80353855 163.62967333333333 2 823.409045741879 50723320.59314618 159.73646000000002 160.07629166666666 167.865625 823.408785904915 2 3 MSMS 23 1 1 159.4417916666667 -0.3159489359085651 +20101230_Velos1_TaGe_SA_Jurkat6-calib-averaged QDLEAQIRGLREEVEK QDLEAQIRGLREEVEK A1A5D9 1912.001403494 71.64922 4 479.007627340379 0 - - - - - 0 MSMS 1 1 1 0 NaN diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_AllResults_IcTda.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_AllResults_IcTda.tsv new file mode 100644 index 000000000..2fed9fd0f --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_AllResults_IcTda.tsv @@ -0,0 +1,7 @@ +Scan Pre Sequence Post Modifications Composition ProteinName ProteinDesc ProteinLength Start End Charge MostAbundantIsotopeMz Mass Ms1Features #MatchedFragments Probability SpecEValue EValue QValue PepQValue +1180 M PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD - C(442) H(756) N(140) O(156) S(0) sp|P05114|HMGN1_HUMAN Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3 101 2 100 15 702.8454697 10521.55277 0 61 1.0 9.99E-308 9.99E-308 0 0 +1180 - EKTEDKAQDTVAITQIFINGQYGEDAHKDYERGSDRIKKYKVRSGKKLEGTMDEYNIISAKKNVHKATKNRQKYNKINELIGQSNLYSRGF T C(460) H(740) N(136) O(146) S(1) XXX_sp|Q2G1S6|SSL5_STAA8 "Staphylococcal superantigen-like 5 OS=Staphylococcus aureus (strain NCTC 8325 " 235 1 91 15 703.9044982 10537.4382 670 5 0.0017 2.399925E-07 0.009145 10 10 +1180 T DKGQATLRLANIALFVKVHFMSPIKWSATFFMALLYGAEGSIHKLLGNLIPKVRAVIIKKDIVGSSVRLN - C(358) H(585) N(95) O(89) S(2) XXX_sp|Q2GGT3|SYR_EHRCR "Arginine--tRNA 
ligase OS=Ehrlichia chaffeensis (strain ATCC CRL-10679 " 577 507 576 11 701.6776982 7703.361219 427 12 7.2369E-05 2.143132E-05 0.816662 10 10 +1180 - MSQDTEVDMKDVELNELEPEKQPMNAADGAAAGEKNGLVKIKVAEDETEAGVKFTGLSKEELLKVAGSPGWVRTRWALLLLFWLGWLGMLAGAVVII V C(473) H(758) N(120) O(142) S(4) sp|P10852|4F2_MOUSE Amino acid transporter heavy chain SLC3A2 OS=Mus musculus OX=10090 GN=Slc3a2 PE=1 SV=1 527 1 97 15 702.7063795 10519.46642 660 9 3.1386E-04 1.311625E-06 0.049981 10 10 +1180 - MAPGSVTSDISPSSTSTAGSSRSPESEKPGPSHGGVPPGGPSHSSLPVGRRHPPVLRMVLEALQAGEQRRGTSVAAIKLYILHKYPTVDVLRFKYLLKQA L C(463) H(753) N(137) O(139) S(2) sp|Q8IZA3|H18_HUMAN Histone H1.8 OS=Homo sapiens OX=9606 GN=H1-8 PE=2 SV=1 347 1 100 15 702.7786627 10520.55066 661 8 8.4705E-05 1.802522E-06 0.068687 10 10 +1181 M PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD - C(442) H(756) N(140) O(156) S(0) sp|P05114|HMGN1_HUMAN Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3 101 2 100 14 752.9767692 10521.55277 0 60 1.0 9.99E-308 9.99E-308 0 0 diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_DecoyResults_IcDecoy.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_DecoyResults_IcDecoy.tsv new file mode 100644 index 000000000..68e245455 --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_DecoyResults_IcDecoy.tsv @@ -0,0 +1,7 @@ +Scan Pre Sequence Post Modifications Composition ProteinName ProteinDesc ProteinLength Start End Charge MostAbundantIsotopeMz Mass Ms1Features #MatchedFragments Probability SpecEValue EValue +1180 - EKTEDKAQDTVAITQIFINGQYGEDAHKDYERGSDRIKKYKVRSGKKLEGTMDEYNIISAKKNVHKATKNRQKYNKINELIGQSNLYSRGF T C(460) H(740) N(136) O(146) S(1) XXX_sp|Q2G1S6|SSL5_STAA8 "Staphylococcal superantigen-like 5 OS=Staphylococcus aureus (strain NCTC 8325 " 235 1 91 15 703.9044982 10537.4382 670 5 0.0017 2.399925E-07 0.009145 +1180 T 
DKGQATLRLANIALFVKVHFMSPIKWSATFFMALLYGAEGSIHKLLGNLIPKVRAVIIKKDIVGSSVRLN - C(358) H(585) N(95) O(89) S(2) XXX_sp|Q2GGT3|SYR_EHRCR "Arginine--tRNA ligase OS=Ehrlichia chaffeensis (strain ATCC CRL-10679 " 577 507 576 11 701.6776982 7703.361219 427 12 7.2369E-05 2.143132E-05 0.816662 +1181 R RGRLFKAHFTMLLSVLHKAGAALKPRPAGSVQLYCHLLRPPRPATPAAEPFGHPEGADCPFSTFSLETRGIGPYLSLAYIFLNKDGRSRLRTYGCA P C(477) H(745) N(137) O(124) S(4) XXX_sp|Q6UXT8|ALKL1_HUMAN ALK and LTK ligand 1 OS=Homo sapiens OX=9606 GN=ALKAL1 PE=1 SV=1 130 2 97 14 751.7593204 10504.50849 655 9 0.0152 5.839E-08 0.002225 +1181 - EKTEDKAQDTVAITQIFINGQYGEDAHKDYERGSDRIKKYKVRSGKKLEGTMDEYNIISAKKNVHKATKNRQKYNKINELIGQSNLYSRGF T C(460) H(740) N(136) O(146) S(1) XXX_sp|Q2G1S6|SSL5_STAA8 "Staphylococcal superantigen-like 5 OS=Staphylococcus aureus (strain NCTC 8325 " 235 1 91 14 754.1114426 10537.4382 670 4 1.6191E-04 9.002121E-07 0.034303 +1181 - SALTILDEVVDLVAKTIEVIKLTSYDANGKQFNQEEFITNIPSVLSGYVTLADDANAFIDAPSDAKSRFGTGEFKVSKEIGLRVLHNRITNGGIQVQ K C(469) H(752) N(124) O(150) S(0) XXX_sp|P10592|HSP72_YEAST "Heat shock protein SSA2 OS=Saccharomyces cerevisiae (strain ATCC 204508 " 640 1 97 14 752.9731998 10521.5028 662 8 1.1724E-04 1.072104E-06 0.040854 +1181 N PAELTKRVQLDMVCRIKEPLRSVKPLPSLAAPSVSSPQLASKSAATKVSMSFSAYMLKMISPVNRAQRCSPQSYRTHVTSPMKRSAVVSEYLSHDP T C(459) H(763) N(133) O(136) S(7) XXX_sp|Q9PTQ7|DMRT1_CHICK Doublesex- and mab-3-related transcription factor 1 OS=Gallus gallus OX=9031 GN=DMRT1 PE=2 SV=2 366 2 97 14 754.1867306 10538.49223 671 8 1.8379E-04 2.215391E-06 0.08442 diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_TargetResults_IcTarget.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_TargetResults_IcTarget.tsv new file mode 100644 index 000000000..e27dbb8dc --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/MsPathFinderT_TargetResults_IcTarget.tsv @@ -0,0 +1,7 @@ +Scan Pre Sequence Post Modifications Composition ProteinName ProteinDesc ProteinLength Start 
End Charge MostAbundantIsotopeMz Mass Ms1Features #MatchedFragments Probability SpecEValue EValue +592 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 14 662.5096855 9256.016953 531 17 0.9999 1.642469E-36 6.258791E-32 +595 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 15 618.4095249 9256.016953 531 12 0.2209 2.528896E-17 9.636612E-13 +1089 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 15 618.4095249 9256.016953 530 6 1.1566E-04 1.241835E-14 4.732136E-10 +1090 M PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK - C(395) H(673) N(129) O(127) S(0) sp|P05204|HMGN2_HUMAN Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3 91 2 90 14 662.5096855 9256.016953 530 7 0.0039 1.27173E-17 4.846056E-13 +1169 M PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD - C(442) H(756) N(140) O(156) S(0) sp|P05114|HMGN1_HUMAN Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3 101 2 100 15 702.8454697 10521.55277 662 13 0.9938 6.040125E-20 2.30165E-15 +1169 M PKRKSPENTEGKDGSKVTKQEPTRRSARLSAKPAPPKPEPKPRKTSAKKEPGAKISRGAKGKKEEKQEAGKEGTAPSENGETKAEEAQKTESVDNEGE - C(442) H(749) N(141) O(156) S(0) sp|Q15651|HMGN3_HUMAN High mobility group nucleosome-binding domain-containing protein 3 OS=Homo sapiens OX=9606 GN=HMGN3 PE=1 SV=2 100 2 99 15 703.3086896 10528.50107 666 7 0.161 
3.706717E-14 1.412482E-09 diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv new file mode 100644 index 000000000..a87d8493f --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/SmallCalibratibleYeastFragger_psm.tsv @@ -0,0 +1,6 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +SmallCalibratibleYeast.00002.00002.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml RGNVCGDAK RGNVCGDAK NVSVKEIR.RGNVCGDAK.NDPPKGCA R N 9 2 1443.6643 975.4455 975.4455 488.7300 488.7300 975.4556 488.7351 -0.0101 0.00047835460000 25.7810 11.6900 0.9997 2 1 320 328 1.81151584E8 5C(57.0215) 0.00 true sp|P02994|EF1A_YEAST P02994 EF1A_YEAST TEF1 Elongation factor 1-alpha +SmallCalibratibleYeast.00004.00004.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml EKAEAEAEK GIREKRAR.EKAEAEAEK.KK R K 9 2 1444.1241 1003.4855 1003.4855 502.7500 502.7500 1003.4821 502.7483 0.0033 0.02974116000000 18.6380 12.6310 0.9269 2 1 189 197 3.6544424E7 0.00 false sp|P40212|RL13B_YEAST P40212 RL13B_YEAST RPL13B Large ribosomal subunit protein eL13B RPL13A sp|Q12690|RL13A_YEAST +SmallCalibratibleYeast.00008.00008.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml KITSNQR FNVPIDGK.KITSNQR.IVAAIPTI K I 7 2 1446.0237 845.4655 845.4655 423.7400 423.7400 845.4719 423.7432 -0.0064 0.13056640000000 17.5940 12.5170 0.9646 2 1 
33 39 2.5555878E7 0.00 true sp|P00560|PGK_YEAST P00560 PGK_YEAST PGK1 Phosphoglycerate kinase +SmallCalibratibleYeast.00009.00009.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml GIDHTSK .GIDHTSK.QHKRSGHR M Q 7 2 1446.2545 756.3855 756.3855 379.2000 379.2000 756.3766 379.1956 0.0089 0.01669167000000 17.6100 9.8270 0.9965 2 0 2 8 6.739196E7 0.00 false sp|P0CX49|RL18A_YEAST P0CX49 RL18A_YEAST RPL18A Large ribosomal subunit protein eL18A RPL18B sp|P0CX50|RL18B_YEAST +SmallCalibratibleYeast.00010.00010.2 E:\MadeleineH\YeastProteomeMSFragger\1\interact-SmallCalibratibleYeast.pep.xml EKAEAEAEK GIREKRAR.EKAEAEAEK.KK R K 9 2 1446.4789 1003.4855 1003.4855 502.7500 502.7500 1003.4821 502.7483 0.0033 0.02315876000000 18.5780 12.4510 0.9875 2 1 189 197 3.6544424E7 0.00 false sp|P40212|RL13B_YEAST P40212 RL13B_YEAST RPL13B Large ribosomal subunit protein eL13B RPL13A sp|Q12690|RL13A_YEAST diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/crux.txt b/mzLib/Test/FileReadingTests/ExternalFileTypes/crux.txt new file mode 100644 index 000000000..aeff49496 --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/crux.txt @@ -0,0 +1,15 @@ +file scan charge retention time spectrum precursor m/z spectrum neutral mass peptide mass delta_cn xcorr score xcorr rank tailor score tdc q-value b/y ions matched b/y ions total b/y ions fraction b/y ion repeat match distinct matches/spectrum sequence unmodified sequence protein id flanking aa +/hdd/data/PXD005590/B02_21_161103_D4_HCD_OT_4ul.raw.mzXML 14674 3 2747.6599 1075.1815 3222.5227 3222.5222 0.84335566 6.4364114 1 1.9659604 3.8850189e-06 51 116 0.43965518 0 68 RPQYSNPPVQGEVMEGADNQGAGEQGRPVR RPQYSNPPVQGEVMEGADNQGAGEQGRPVR sp|P67809|YBOX1_HUMAN(205) RQ +/hdd/data/PXD005590/B02_20_161103_E4_HCD_OT_4ul.raw.mzXML 15417 3 2814.6499 1075.182 3222.5242 3222.5222 0.86036599 6.3186069 1 1.9550625 3.8850189e-06 48 116 0.41379309 3 68 RPQYSNPPVQGEVMEGADNQGAGEQGRPVR RPQYSNPPVQGEVMEGADNQGAGEQGRPVR 
sp|P67809|YBOX1_HUMAN(205) RQ +/hdd/data/PXD005590/B02_18_161103_B4_HCD_OT_4ul.raw.mzXML 6847 4 2012.87 918.185 3668.7109 3668.7124 0.83817238 6.7191076 1 1.9478002 3.8850189e-06 53 191 0.27748692 0 64 AASAAGAAGSAGGSSGAAGAAGGGAGAGTRPGDGGTASAGAAGPGAATK AASAAGAAGSAGGSSGAAGAAGGGAGAGTRPGDGGTASAGAAGPGAATK sp|Q9UKY7|CDV3_HUMAN(28) RA +/hdd/data/PXD005590/B02_06_161103_A1_HCD_OT_4ul.raw.mzXML 74906 3 8146.6001 1004.5292 3010.5659 3010.5623 0.86094695 6.1447253 1 1.9289217 3.8850189e-06 39 116 0.33620691 0 122 HIADLAGNSEVILPVPAFNVINGGSHAGNK HIADLAGNSEVILPVPAFNVINGGSHAGNK sp|P06733|ENOA_HUMAN(133) RL +/hdd/data/PXD005590/B02_22_161103_D1_HCD_OT_4ul.raw.mzXML 65300 3 7277.5698 867.7704 2600.2896 2600.2869 0.86649311 6.2026334 1 1.9265088 3.8850189e-06 37 96 0.38541666 0 160 NHDTGVSPVFAGGVEYAITPEIATR NHDTGVSPVFAGGVEYAITPEIATR sp|P0A910|OMPA_ECOLI(135) KL +/hdd/data/PXD005590/B02_11_161103_D2_HCD_OT_4ul.raw.mzXML 32062 4 4347.98 668.6035 2670.3848 2670.3875 0.82502377 6.3233223 1 1.9088538 3.8850189e-06 41 95 0.43157896 0 102 EEHEVAVLGAPHNPAPPTSTVIHIR EEHEVAVLGAPHNPAPPTSTVIHIR sp|Q01628|IFM3_HUMAN(25) KS +/hdd/data/PXD005590/B02_16_161103_A3_HCD_OT_4ul.raw.mzXML 51309 4 6030.9102 884.4734 3533.8647 3533.8586 0.86957496 6.0390177 1 1.9073439 3.8850189e-06 35 139 0.25179857 0 26 AHSSPASLQLGAVSPGTLTPTGVVSGPAATPTAQHLR AHSSPASLQLGAVSPGTLTPTGVVSGPAATPTAQHLR sp|P46937|YAP1_HUMAN(125) RQ +/hdd/data/PXD005590/B02_21_161103_D4_HCD_OT_4ul.raw.mzXML 59327 4 6765.9102 771.1469 3080.5586 3080.561 0.81879568 6.7819619 1 1.9058784 3.8850189e-06 43 127 0.33858269 0 82 GAAAQGQTQTVAAQAQALAAQAAAAAHAAQAHR GAAAQGQTQTVAAQAQALAAQAAAAAHAAQAHR sp|Q9BTU6|P4K2A_HUMAN(67) RE +/hdd/data/PXD005590/B02_06_161103_A1_HCD_OT_4ul.raw.mzXML 4435 3 1815.64 751.6635 2251.9685 2251.9666 0.84570491 6.0909443 1 1.9016477 3.8850189e-06 38 92 0.41304347 0 112 APKPDGPGGGPGGSHMGGNYGDDR APKPDGPGGGPGGSHMGGNYGDDR sp|P35637|FUS_HUMAN(449) KR +/hdd/data/PXD005590/B02_21_161103_D4_HCD_OT_4ul.raw.mzXML 19161 3 3172.6001 827.3565 
2479.0476 2479.0457 0.86390048 5.704639 1 1.8994089 3.8850189e-06 38 92 0.41304347 0 52 QDHPSSMGVYGQESGGFSGPGENR QDHPSSMGVYGQESGGFSGPGENR sp|Q01844|EWS_HUMAN(269) RS +/hdd/data/PXD005590/B02_24_161103_C1_HCD_OT_4ul.raw.mzXML 58893 2 6700.6001 1396.1667 2790.3188 2790.3218 0.85683089 5.7747865 1 1.897424 3.8850189e-06 31 52 0.59615386 3 134 HTGPGILSMANAGPNTNGSQFFICTAK HTGPGILSMANAGPNTNGSQFFICTAK sp|P62937|PPIA_HUMAN(92) KT +/hdd/data/PXD005590/B02_001_161103_B1_HCD_OT_4ul.raw.mzXML 17264 3 2921.5601 1075.1821 3222.5247 3222.5222 0.81186515 5.8244729 1 1.8960458 3.8850189e-06 49 116 0.4224138 0 68 RPQYSNPPVQGEVMEGADNQGAGEQGRPVR RPQYSNPPVQGEVMEGADNQGAGEQGRPVR sp|P67809|YBOX1_HUMAN(205) RQ +/hdd/data/PXD005590/B02_19_161103_C4_HCD_OT_4ul.raw.mzXML 72508 3 7969.2202 1004.5298 3010.5674 3010.5623 0.85636955 5.7181306 1 1.894421 3.8850189e-06 36 116 0.31034482 0 122 HIADLAGNSEVILPVPAFNVINGGSHAGNK HIADLAGNSEVILPVPAFNVINGGSHAGNK sp|P06733|ENOA_HUMAN(133) RL +/hdd/data/PXD005590/B02_20_161103_E4_HCD_OT_4ul.raw.mzXML 19752 3 3220.75 827.3577 2479.0515 2479.0457 0.85362118 5.682076 1 1.8917845 3.8850189e-06 32 92 0.34782609 0 52 QDHPSSMGVYGQESGGFSGPGENR QDHPSSMGVYGQESGGFSGPGENR sp|Q01844|EWS_HUMAN(269) RS diff --git a/mzLib/Test/FileReadingTests/TestCruxReader.cs b/mzLib/Test/FileReadingTests/TestCruxReader.cs new file mode 100644 index 000000000..38b800c7b --- /dev/null +++ b/mzLib/Test/FileReadingTests/TestCruxReader.cs @@ -0,0 +1,145 @@ +using NUnit.Framework; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using Newtonsoft.Json; +using Readers; + +namespace Test.FileReadingTests +{ + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestCruxReader + { + private static string directoryPath; + + [OneTimeSetUp] + public void SetUp() + { + directoryPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ReadingWritingTests"); + Directory.CreateDirectory(directoryPath); + } + + [OneTimeTearDown] + public void 
TearDown() + { + Directory.Delete(directoryPath, true); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\crux.txt", 14)] + public void TestCruxResultsLoadsAndCountCorrect(string path, int recordCount) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + CruxResultFile file = new CruxResultFile(filePath); + Assert.That(file.Count(), Is.EqualTo(recordCount)); + Assert.That(file.CanRead(path)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\crux.txt", 14)] + public static void TestCruxResultsFromGenericReader(string path, int recordCount) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + var constructedFile = new CruxResultFile(filePath); + var genericFile = FileReader.ReadFile(filePath); + + Assert.That(genericFile.Count(), Is.EqualTo(recordCount)); + Assert.That(genericFile.Count(), Is.EqualTo(constructedFile.Count())); + Assert.That(genericFile.FilePath, Is.EqualTo(constructedFile.FilePath)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\crux.txt")] + public void TestCruxResultsFirstAndLastAreCorrect(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + var file = new CruxResultFile(filePath); + + var first = file.First(); + var last = file.Last(); + + Assert.That(first.FilePath, Is.EqualTo(@"/hdd/data/PXD005590/B02_21_161103_D4_HCD_OT_4ul.raw.mzXML")); + Assert.That(first.OneBasedScanNumber, Is.EqualTo(14674)); + Assert.That(first.Charge, Is.EqualTo(3)); + Assert.That(first.RetentionTime, Is.EqualTo(2747.6599)); + Assert.That(first.PrecursorMz, Is.EqualTo(1075.1815)); + Assert.That(first.NeutralMass, Is.EqualTo(3222.5227)); + Assert.That(first.PeptideMass, Is.EqualTo(3222.5222)); + Assert.That(first.DeltaCn, Is.EqualTo(0.84335566)); + Assert.That(first.XCorrScore, Is.EqualTo(6.4364114)); + Assert.That(first.XCorrRank, Is.EqualTo(1)); + Assert.That(first.TailorScore, Is.EqualTo(1.9659604)); + 
Assert.That(first.TdcQValue, Is.EqualTo(0.0000038850189).Within(1E-6)); + Assert.That(first.BAndYIonsMatched, Is.EqualTo(51)); + Assert.That(first.BAndYIonsTotal, Is.EqualTo(116)); + Assert.That(first.BAndYIonsFraction, Is.EqualTo(0.43965518)); + Assert.That(first.BAndYIonRepeatMatch, Is.EqualTo(0)); + Assert.That(first.BaseSequence, Is.EqualTo("RPQYSNPPVQGEVMEGADNQGAGEQGRPVR")); + Assert.That(first.FullSequence, Is.EqualTo("RPQYSNPPVQGEVMEGADNQGAGEQGRPVR")); + Assert.That(first.ProteinId, Is.EqualTo("sp|P67809|YBOX1_HUMAN(205)")); + Assert.That(first.FlankingAa, Is.EqualTo("RQ")); + Assert.That(first.FileNameWithoutExtension, Is.EqualTo("B02_21_161103_D4_HCD_OT_4ul.raw")); + Assert.That(first.Accession, Is.EqualTo("P67809")); + + Assert.That(last.FilePath, Is.EqualTo(@"/hdd/data/PXD005590/B02_20_161103_E4_HCD_OT_4ul.raw.mzXML")); + Assert.That(last.OneBasedScanNumber, Is.EqualTo(19752)); + Assert.That(last.Charge, Is.EqualTo(3)); + Assert.That(last.RetentionTime, Is.EqualTo(3220.75)); + Assert.That(last.PrecursorMz, Is.EqualTo(827.3577)); + Assert.That(last.NeutralMass, Is.EqualTo(2479.0515)); + Assert.That(last.PeptideMass, Is.EqualTo(2479.0457)); + Assert.That(last.DeltaCn, Is.EqualTo(0.85362118)); + Assert.That(last.XCorrScore, Is.EqualTo(5.682076)); + Assert.That(last.XCorrRank, Is.EqualTo(1)); + Assert.That(last.TailorScore, Is.EqualTo(1.8917845)); + Assert.That(last.TdcQValue, Is.EqualTo(0.00000388501896).Within(1E-6)); + Assert.That(last.BAndYIonsMatched, Is.EqualTo(32)); + Assert.That(last.BAndYIonsTotal, Is.EqualTo(92)); + Assert.That(last.BAndYIonsFraction, Is.EqualTo(0.34782609)); + Assert.That(last.BAndYIonRepeatMatch, Is.EqualTo(0)); + Assert.That(last.BaseSequence, Is.EqualTo("QDHPSSMGVYGQESGGFSGPGENR")); + Assert.That(last.FullSequence, Is.EqualTo("QDHPSSMGVYGQESGGFSGPGENR")); + Assert.That(last.ProteinId, Is.EqualTo("sp|Q01844|EWS_HUMAN(269)")); + Assert.That(last.FlankingAa, Is.EqualTo("RS")); + Assert.That(last.FileNameWithoutExtension, 
Is.EqualTo("B02_20_161103_E4_HCD_OT_4ul.raw")); + Assert.That(last.Accession, Is.EqualTo("Q01844")); + } + + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\crux.txt")] + public void TestCruxResultsWriteResults(string path) + { + // load in original + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + var original = new CruxResultFile(filePath); + + // write out original + var outputPath = Path.Combine(directoryPath, "cruxResults.csv"); + original.WriteResults(outputPath); + Assert.That(File.Exists(outputPath)); + + // read in new original + var written = new CruxResultFile(outputPath); + Assert.That(written.Count(), Is.EqualTo(original.Count())); + + // check are equivalent + for (int i = 0; i < original.Count(); i++) + { + var oldRecord = JsonConvert.SerializeObject(original.Results[i]); + var newRecord = JsonConvert.SerializeObject(written.Results[i]); + Assert.That(oldRecord, Is.EqualTo(newRecord)); + } + + // test writer still works without specifying extensions + var outputPathWithoutExtension = Path.Combine(directoryPath, "cruxResults"); + original.WriteResults(outputPathWithoutExtension); + Assert.That(File.Exists(outputPathWithoutExtension + ".csv")); + + var writtenWithoutExtension = new CruxResultFile(outputPathWithoutExtension + ".csv"); + Assert.That(writtenWithoutExtension.Count(), Is.EqualTo(original.Count())); + } + } +} diff --git a/mzLib/Test/FileReadingTests/TestMgf.cs b/mzLib/Test/FileReadingTests/TestMgf.cs index 5c1d7568b..05251365c 100644 --- a/mzLib/Test/FileReadingTests/TestMgf.cs +++ b/mzLib/Test/FileReadingTests/TestMgf.cs @@ -3,6 +3,7 @@ using System.Linq; using MassSpectrometry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Readers; using MzLibUtil; using Stopwatch = System.Diagnostics.Stopwatch; diff --git a/mzLib/Test/FileReadingTests/TestMsDataFile.cs b/mzLib/Test/FileReadingTests/TestMsDataFile.cs index eed172009..5977d4b71 100644 --- 
a/mzLib/Test/FileReadingTests/TestMsDataFile.cs +++ b/mzLib/Test/FileReadingTests/TestMsDataFile.cs @@ -21,7 +21,8 @@ using MassSpectrometry; using MzLibUtil; using NUnit.Framework; -using Proteomics; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Proteomics.AminoAcidPolymer; using Proteomics.ProteolyticDigestion; using System; diff --git a/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs b/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs index 06f63372e..765bd5c1c 100644 --- a/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs +++ b/mzLib/Test/FileReadingTests/TestMsFraggerResultFiles.cs @@ -45,6 +45,21 @@ public void TestMsFraggerPsmLoadsAndCountCorrect(string path, int count) Assert.That(file.CanRead(path)); } + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")] + public void TestAddProteinGroupInfoCountCorrect (string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + var allResults = file.ToList(); + + // one protein associated with given results, list should only contain this one element + Assert.That(allResults[0].ProteinGroupInfos.Count, Is.EqualTo(1)); + // two proteins associated with given results, list should contain two elements + Assert.That(allResults[2].ProteinGroupInfos.Count, Is.EqualTo(2)); + + } + [Test] [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPeptide_FragPipev21.1individual_peptide.tsv", 7)] [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPeptide_FragPipev21.1combined_peptide.tsv", 6)] diff --git a/mzLib/Test/FileReadingTests/TestMsPathFinderT.cs b/mzLib/Test/FileReadingTests/TestMsPathFinderT.cs new file mode 100644 index 000000000..1328609a9 --- /dev/null +++ b/mzLib/Test/FileReadingTests/TestMsPathFinderT.cs @@ -0,0 +1,254 @@ +using NUnit.Framework; +using System; +using 
System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Newtonsoft.Json; +using Readers; + +namespace Test.FileReadingTests +{ + [ExcludeFromCodeCoverage] + public class TestMsPathFinderT + { + private static string directoryPath; + + [OneTimeSetUp] + public void SetUp() + { + directoryPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ReadingWritingTests"); + Directory.CreateDirectory(directoryPath); + } + + [OneTimeTearDown] + public void TearDown() + { + Directory.Delete(directoryPath, true); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_TargetResults_IcTarget.tsv", 6)] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_DecoyResults_IcDecoy.tsv", 6)] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_AllResults_IcTda.tsv", 6)] + public void TestMsPathFinderTLoadsAndCountCorrect(string path, int count) + { + var filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsPathFinderTResultFile file = new MsPathFinderTResultFile(filePath); + Assert.That(file.Count(), Is.EqualTo(count)); + Assert.That(file.CanRead(path)); + } + + [Test] + public static void TestMsPathFinderTAllResultsFirstAndLastCorrect() + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ExternalFileTypes\MsPathFinderT_AllResults_IcTda.tsv"); + MsPathFinderTResultFile file = new MsPathFinderTResultFile(filePath); + var first = file.First(); + var last = file.Last(); + + Assert.That(first.OneBasedScanNumber, Is.EqualTo(1180)); + Assert.That(first.PreviousResidue, Is.EqualTo('M')); + Assert.That(first.BaseSequence, Is.EqualTo("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD")); + Assert.That(first.NextResidue, Is.EqualTo('-')); + Assert.That(first.Modifications, Is.Empty); + 
Assert.That(first.ChemicalFormula.Formula, Is.EqualTo("C442H756N140O156")); + Assert.That(first.ProteinName, Is.EqualTo("sp|P05114|HMGN1_HUMAN")); + Assert.That(first.ProteinDescription, Is.EqualTo("Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3")); + Assert.That(first.Length, Is.EqualTo(101)); + Assert.That(first.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(first.OneBasedEndResidue, Is.EqualTo(100)); + Assert.That(first.Charge, Is.EqualTo(15)); + Assert.That(first.MostAbundantIsotopeMz, Is.EqualTo(702.8454697)); + Assert.That(first.MonoisotopicMass, Is.EqualTo(10521.55277)); + Assert.That(first.Ms1Features, Is.EqualTo(0)); + Assert.That(first.NumberOfMatchedFragments, Is.EqualTo(61)); + Assert.That(first.Probability, Is.EqualTo(1.0)); + Assert.That(first.SpecEValue, Is.EqualTo(9.99E-308)); + Assert.That(first.EValue, Is.EqualTo(9.99E-308)); + Assert.That(first.QValue, Is.EqualTo(0)); + Assert.That(first.PepQValue, Is.EqualTo(0)); + Assert.That(first.Accession, Is.EqualTo("P05114")); + Assert.That(first.IsDecoy, Is.EqualTo(false)); + Assert.That(first.FileNameWithoutExtension, Is.EqualTo("MsPathFinderT_AllResults")); + + Assert.That(last.OneBasedScanNumber, Is.EqualTo(1181)); + Assert.That(last.PreviousResidue, Is.EqualTo('M')); + Assert.That(last.BaseSequence, Is.EqualTo("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD")); + Assert.That(last.NextResidue, Is.EqualTo('-')); + Assert.That(last.Modifications, Is.Empty); + Assert.That(last.ChemicalFormula.Formula, Is.EqualTo("C442H756N140O156")); + Assert.That(last.ProteinName, Is.EqualTo("sp|P05114|HMGN1_HUMAN")); + Assert.That(last.ProteinDescription, Is.EqualTo("Non-histone chromosomal protein HMG-14 OS=Homo sapiens OX=9606 GN=HMGN1 PE=1 SV=3")); + Assert.That(last.Length, Is.EqualTo(101)); + Assert.That(last.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(last.OneBasedEndResidue, Is.EqualTo(100)); + 
Assert.That(last.Charge, Is.EqualTo(14)); + Assert.That(last.MostAbundantIsotopeMz, Is.EqualTo(752.9767692)); + Assert.That(last.MonoisotopicMass, Is.EqualTo(10521.55277)); + Assert.That(last.Ms1Features, Is.EqualTo(0)); + Assert.That(last.NumberOfMatchedFragments, Is.EqualTo(60)); + Assert.That(last.Probability, Is.EqualTo(1.0)); + Assert.That(last.SpecEValue, Is.EqualTo(9.99E-308)); + Assert.That(last.EValue, Is.EqualTo(9.99E-308)); + Assert.That(last.QValue, Is.EqualTo(0)); + Assert.That(last.PepQValue, Is.EqualTo(0)); + Assert.That(last.Accession, Is.EqualTo("P05114")); + Assert.That(last.IsDecoy, Is.EqualTo(false)); + Assert.That(last.FileNameWithoutExtension, Is.EqualTo("MsPathFinderT_AllResults")); + } + + [Test] + public static void TestMsPathFinderTTargetResultsFirstAndLastAreCorrect() + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ExternalFileTypes\MsPathFinderT_TargetResults_IcTarget.tsv"); + MsPathFinderTResultFile file = new MsPathFinderTResultFile(filePath); + var first = file.First(); + var last = file.Last(); + + Assert.That(first.OneBasedScanNumber, Is.EqualTo(592)); + Assert.That(first.PreviousResidue, Is.EqualTo('M')); + Assert.That(first.BaseSequence, Is.EqualTo("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK")); + Assert.That(first.NextResidue, Is.EqualTo('-')); + Assert.That(first.Modifications, Is.Empty); + Assert.That(first.ChemicalFormula.Formula, Is.EqualTo("C395H673N129O127")); + Assert.That(first.ProteinName, Is.EqualTo("sp|P05204|HMGN2_HUMAN")); + Assert.That(first.ProteinDescription, Is.EqualTo("Non-histone chromosomal protein HMG-17 OS=Homo sapiens OX=9606 GN=HMGN2 PE=1 SV=3")); + Assert.That(first.Length, Is.EqualTo(91)); + Assert.That(first.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(first.OneBasedEndResidue, Is.EqualTo(90)); + Assert.That(first.Charge, Is.EqualTo(14)); + Assert.That(first.MostAbundantIsotopeMz, 
Is.EqualTo(662.5096855)); + Assert.That(first.MonoisotopicMass, Is.EqualTo(9256.016953)); + Assert.That(first.Ms1Features, Is.EqualTo(531)); + Assert.That(first.NumberOfMatchedFragments, Is.EqualTo(17)); + Assert.That(first.Probability, Is.EqualTo(0.9999)); + Assert.That(first.SpecEValue, Is.EqualTo(1.642469E-36)); + Assert.That(first.EValue, Is.EqualTo(6.258791E-32)); + Assert.That(first.QValue, Is.EqualTo(0)); + Assert.That(first.PepQValue, Is.EqualTo(0)); + Assert.That(first.Accession, Is.EqualTo("P05204")); + Assert.That(first.IsDecoy, Is.EqualTo(false)); + + Assert.That(last.OneBasedScanNumber, Is.EqualTo(1169)); + Assert.That(last.PreviousResidue, Is.EqualTo('M')); + Assert.That(last.BaseSequence, Is.EqualTo("PKRKSPENTEGKDGSKVTKQEPTRRSARLSAKPAPPKPEPKPRKTSAKKEPGAKISRGAKGKKEEKQEAGKEGTAPSENGETKAEEAQKTESVDNEGE")); + Assert.That(last.NextResidue, Is.EqualTo('-')); + Assert.That(last.Modifications, Is.Empty); + Assert.That(last.ChemicalFormula.Formula, Is.EqualTo("C442H749N141O156")); + Assert.That(last.ProteinName, Is.EqualTo("sp|Q15651|HMGN3_HUMAN")); + Assert.That(last.ProteinDescription, Is.EqualTo("High mobility group nucleosome-binding domain-containing protein 3 OS=Homo sapiens OX=9606 GN=HMGN3 PE=1 SV=2")); + Assert.That(last.Length, Is.EqualTo(100)); + Assert.That(last.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(last.OneBasedEndResidue, Is.EqualTo(99)); + Assert.That(last.Charge, Is.EqualTo(15)); + Assert.That(last.MostAbundantIsotopeMz, Is.EqualTo(703.3086896)); + Assert.That(last.MonoisotopicMass, Is.EqualTo(10528.50107)); + Assert.That(last.Ms1Features, Is.EqualTo(666)); + Assert.That(last.NumberOfMatchedFragments, Is.EqualTo(7)); + Assert.That(last.Probability, Is.EqualTo(0.161)); + Assert.That(last.SpecEValue, Is.EqualTo(3.706717E-14)); + Assert.That(last.EValue, Is.EqualTo(1.412482E-09)); + Assert.That(last.QValue, Is.EqualTo(0)); + Assert.That(last.PepQValue, Is.EqualTo(0)); + Assert.That(last.Accession, Is.EqualTo("Q15651")); + 
Assert.That(last.IsDecoy, Is.EqualTo(false)); + } + + [Test] + public static void TestMsPathFinderTDecoysFirstAndLastIsCorrect() + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ExternalFileTypes\MsPathFinderT_DecoyResults_IcDecoy.tsv"); + MsPathFinderTResultFile file = new MsPathFinderTResultFile(filePath); + var first = file.First(); + var last = file.Last(); + + Assert.That(first.OneBasedScanNumber, Is.EqualTo(1180)); + Assert.That(first.PreviousResidue, Is.EqualTo('-')); + Assert.That(first.BaseSequence, Is.EqualTo("EKTEDKAQDTVAITQIFINGQYGEDAHKDYERGSDRIKKYKVRSGKKLEGTMDEYNIISAKKNVHKATKNRQKYNKINELIGQSNLYSRGF")); + Assert.That(first.NextResidue, Is.EqualTo('T')); + Assert.That(first.Modifications, Is.Empty); + Assert.That(first.ChemicalFormula.Formula, Is.EqualTo("C460H740N136O146S")); + Assert.That(first.ProteinName, Is.EqualTo("XXX_sp|Q2G1S6|SSL5_STAA8")); + Assert.That(first.ProteinDescription, Is.EqualTo("Staphylococcal superantigen-like 5 OS=Staphylococcus aureus (strain NCTC 8325")); + Assert.That(first.Length, Is.EqualTo(235)); + Assert.That(first.OneBasedStartResidue, Is.EqualTo(1)); + Assert.That(first.OneBasedEndResidue, Is.EqualTo(91)); + Assert.That(first.Charge, Is.EqualTo(15)); + Assert.That(first.MostAbundantIsotopeMz, Is.EqualTo(703.9044982)); + Assert.That(first.MonoisotopicMass, Is.EqualTo(10537.4382)); + Assert.That(first.Ms1Features, Is.EqualTo(670)); + Assert.That(first.NumberOfMatchedFragments, Is.EqualTo(5)); + Assert.That(first.Probability, Is.EqualTo(0.0017)); + Assert.That(first.SpecEValue, Is.EqualTo(2.399925E-07)); + Assert.That(first.EValue, Is.EqualTo(0.009145)); + Assert.That(first.QValue, Is.EqualTo(0)); + Assert.That(first.PepQValue, Is.EqualTo(0)); + Assert.That(first.Accession, Is.EqualTo("Q2G1S6")); + Assert.That(first.IsDecoy, Is.EqualTo(true)); + + Assert.That(last.OneBasedScanNumber, Is.EqualTo(1181)); + Assert.That(last.PreviousResidue, Is.EqualTo('N')); + 
Assert.That(last.BaseSequence, Is.EqualTo("PAELTKRVQLDMVCRIKEPLRSVKPLPSLAAPSVSSPQLASKSAATKVSMSFSAYMLKMISPVNRAQRCSPQSYRTHVTSPMKRSAVVSEYLSHDP")); + Assert.That(last.NextResidue, Is.EqualTo('T')); + Assert.That(last.Modifications, Is.Empty); + Assert.That(last.ChemicalFormula.Formula, Is.EqualTo("C459H763N133O136S7")); + Assert.That(last.ProteinName, Is.EqualTo("XXX_sp|Q9PTQ7|DMRT1_CHICK")); + Assert.That(last.ProteinDescription, Is.EqualTo("Doublesex- and mab-3-related transcription factor 1 OS=Gallus gallus OX=9031 GN=DMRT1 PE=2 SV=2")); + Assert.That(last.Length, Is.EqualTo(366)); + Assert.That(last.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(last.OneBasedEndResidue, Is.EqualTo(97)); + Assert.That(last.Charge, Is.EqualTo(14)); + Assert.That(last.MostAbundantIsotopeMz, Is.EqualTo(754.1867306)); + Assert.That(last.MonoisotopicMass, Is.EqualTo(10538.49223)); + Assert.That(last.Ms1Features, Is.EqualTo(671)); + Assert.That(last.NumberOfMatchedFragments, Is.EqualTo(8)); + Assert.That(last.Probability, Is.EqualTo(1.8379E-04)); + Assert.That(last.SpecEValue, Is.EqualTo(2.215391E-06)); + Assert.That(last.EValue, Is.EqualTo(0.08442)); + Assert.That(last.QValue, Is.EqualTo(0)); + Assert.That(last.PepQValue, Is.EqualTo(0)); + Assert.That(last.Accession, Is.EqualTo("Q9PTQ7")); + Assert.That(last.IsDecoy, Is.EqualTo(true)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_DecoyResults_IcDecoy.tsv", 2)] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_TargetResults_IcTarget.tsv", 1)] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_AllResults_IcTda.tsv", 3)] + public static void TestMsPathFinderTAllResultsReadWrite(string path, int fileNum) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + string outpath = Path.Combine(directoryPath, $"MsPathFinderT_Out{fileNum}"); + + MsPathFinderTResultFile file = new MsPathFinderTResultFile(filePath); + file.WriteResults(outpath); + + 
MsPathFinderTResultFile outFile = new MsPathFinderTResultFile(outpath + path.ParseFileType().GetFileExtension()); + Assert.That(outFile.Count(), Is.EqualTo(file.Count())); + for (int i = 0; i < file.Count(); i++) + { + var original = System.Text.Json.JsonSerializer.Serialize(file.ElementAt(i)); + var written = System.Text.Json.JsonSerializer.Serialize(outFile.ElementAt(i)); + Assert.That(original, Is.EqualTo(written)); + } + + MsPathFinderTResultFile file2 = FileReader.ReadFile(filePath); + Assert.That(file2.Count(), Is.EqualTo(file.Count())); + for (int i = 0; i < file2.Count(); i++) + { + var original = System.Text.Json.JsonSerializer.Serialize(file.ElementAt(i)); + var read = System.Text.Json.JsonSerializer.Serialize(file2.ElementAt(i)); + Assert.That(original, Is.EqualTo(read)); + } + } + + + } +} diff --git a/mzLib/Test/FileReadingTests/TestMzML.cs b/mzLib/Test/FileReadingTests/TestMzML.cs index c20e13255..826efca77 100644 --- a/mzLib/Test/FileReadingTests/TestMzML.cs +++ b/mzLib/Test/FileReadingTests/TestMzML.cs @@ -9,6 +9,7 @@ using MzIdentML; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics.AminoAcidPolymer; using Readers; using Stopwatch = System.Diagnostics.Stopwatch; diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs index c498c9759..2018158b1 100644 --- a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs +++ b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs @@ -1,5 +1,5 @@ using NUnit.Framework; -using Proteomics.ProteolyticDigestion; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics.PSM; using System; using System.Collections.Generic; diff --git a/mzLib/Test/FileReadingTests/TestQuantifiedPeak.cs b/mzLib/Test/FileReadingTests/TestQuantifiedPeak.cs new file mode 100644 index 000000000..0d301f096 --- /dev/null +++ b/mzLib/Test/FileReadingTests/TestQuantifiedPeak.cs @@ -0,0 +1,185 @@ +using NUnit.Framework; +using System; 
+using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Readers; +using Readers.QuantificationResults; + +namespace Test.FileReadingTests +{ + [ExcludeFromCodeCoverage] + internal class TestQuantifiedPeak + { + internal static string TestDirectory; + internal static string TestFilePath; + + [OneTimeSetUp] + public void SetUp() + { + TestDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ReadingWritingTests"); + TestFilePath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ExternalFileTypes\FlashLFQ_MzLib1.0.549_QuantifiedPeaks.tsv"); + Directory.CreateDirectory(TestDirectory); + } + + [OneTimeTearDown] + public void TearDown() + { + Directory.Delete(TestDirectory, true); + } + + [Test] + public static void TestFileLoadsAndCountCorrect() + { + QuantifiedPeakFile file = new QuantifiedPeakFile(TestFilePath); + Assert.That(file.Count(), Is.EqualTo(4)); + Assert.That(file.CanRead(TestFilePath)); + + file = FileReader.ReadFile(TestFilePath); + Assert.That(file.Count(), Is.EqualTo(4)); + Assert.That(file.CanRead(TestFilePath)); + } + + [Test] + public static void TestFileFirstAndLastAreCorrect() + { + QuantifiedPeakFile file = new QuantifiedPeakFile(TestFilePath); + var first = file.First(); + + Assert.That(first.FileName, Is.EqualTo("20100721_Velos1_TaGe_SA_A549_06-calib-averaged")); + Assert.That(first.BaseSequence, Is.EqualTo("DICNDVLSLLEK")); + Assert.That(first.FullSequence, Is.EqualTo("DIC[Common Fixed:Carbamidomethyl on C]NDVLSLLEK")); + Assert.That(first.ProteinGroup, Is.EqualTo("P63104")); + Assert.That(first.PeptideMonoisotopicMass, Is.EqualTo(1417.712281191)); + Assert.That(first.MS2RetentionTime, Is.EqualTo(188.04623)); + Assert.That(first.PrecursorCharge, Is.EqualTo(3)); + Assert.That(first.TheoreticalMZ, Is.EqualTo(473.578036863879)); + Assert.That(first.PeakIntensity, 
Is.EqualTo(61339740.974156484)); + Assert.That(first.PeakRTStart, Is.EqualTo(187.71813)); + Assert.That(first.PeakRTApex, Is.EqualTo(188.27129333333335)); + Assert.That(first.PeakRTEnd, Is.EqualTo(195.134625)); + Assert.That(first.PeakMz, Is.EqualTo(709.8649279277056)); + Assert.That(first.PeakCharge, Is.EqualTo(2)); + Assert.That(first.NumChargeStatesObserved, Is.EqualTo(3)); + Assert.That(first.PeakDetectionType, Is.EqualTo("MSMS")); + Assert.That(first.MBRScore, Is.EqualTo(0)); + Assert.That(first.PSMsMapped, Is.EqualTo(2)); + Assert.That(first.BaseSequencesMapped, Is.EqualTo(1)); + Assert.That(first.FullSequencesMapped, Is.EqualTo(1)); + Assert.That(first.PeakSplitValleyRT, Is.EqualTo(0)); + Assert.That(first.PeakApexMassError, Is.EqualTo(2.1314131880687888)); + + var last = file.Last(); + Assert.That(last.FileName, Is.EqualTo("20101230_Velos1_TaGe_SA_Jurkat6-calib-averaged")); + Assert.That(last.BaseSequence, Is.EqualTo("QDLEAQIRGLREEVEK")); + Assert.That(last.FullSequence, Is.EqualTo("QDLEAQIRGLREEVEK")); + Assert.That(last.ProteinGroup, Is.EqualTo("A1A5D9")); + Assert.That(last.PeptideMonoisotopicMass, Is.EqualTo(1912.001403494)); + Assert.That(last.MS2RetentionTime, Is.EqualTo(71.64922)); + Assert.That(last.PrecursorCharge, Is.EqualTo(4)); + Assert.That(last.TheoreticalMZ, Is.EqualTo(479.007627340379)); + Assert.That(last.PeakIntensity, Is.EqualTo(0)); + Assert.That(last.PeakRTStart, Is.Null); + Assert.That(last.PeakRTApex, Is.Null); + Assert.That(last.PeakRTEnd, Is.Null); + Assert.That(last.PeakMz, Is.Null); + Assert.That(last.PeakCharge, Is.Null); + Assert.That(last.NumChargeStatesObserved, Is.EqualTo(0)); + Assert.That(last.PeakDetectionType, Is.EqualTo("MSMS")); + Assert.That(last.MBRScore, Is.EqualTo(0)); + Assert.That(last.PSMsMapped, Is.EqualTo(1)); + Assert.That(last.BaseSequencesMapped, Is.EqualTo(1)); + Assert.That(last.FullSequencesMapped, Is.EqualTo(1)); + Assert.That(last.PeakSplitValleyRT, Is.EqualTo(0)); + Assert.That(last.PeakApexMassError, 
Is.EqualTo(double.NaN)); + } + + [Test] + public static void TestFileReadWrite_WithoutExtensionInPath() + { + var file = FileReader.ReadFile(TestFilePath); + var testOutputPath = Path.Combine(TestDirectory, "TestOutput"); + + file.WriteResults(testOutputPath); + var newPath = testOutputPath + file.FileType.GetFileExtension(); + Assert.That(File.Exists(newPath)); + + var writtenFile = new QuantifiedPeakFile(newPath); + Assert.That(file.Count(), Is.EqualTo(writtenFile.Count())); + + for (int i = 0; i < file.Count(); i++) + { + var originalPeak = file.Results[i]; + var writtenPeak = writtenFile.Results[i]; + Assert.That(originalPeak.FileName, Is.EqualTo(writtenPeak.FileName)); + Assert.That(originalPeak.BaseSequence, Is.EqualTo(writtenPeak.BaseSequence)); + Assert.That(originalPeak.FullSequence, Is.EqualTo(writtenPeak.FullSequence)); + Assert.That(originalPeak.ProteinGroup, Is.EqualTo(writtenPeak.ProteinGroup)); + Assert.That(originalPeak.PeptideMonoisotopicMass, Is.EqualTo(writtenPeak.PeptideMonoisotopicMass).Within(0.0000001)); + Assert.That(originalPeak.MS2RetentionTime, Is.EqualTo(writtenPeak.MS2RetentionTime).Within(0.0000001)); + Assert.That(originalPeak.PrecursorCharge, Is.EqualTo(writtenPeak.PrecursorCharge)); + Assert.That(originalPeak.TheoreticalMZ, Is.EqualTo(writtenPeak.TheoreticalMZ).Within(0.0000001)); + Assert.That(originalPeak.PeakIntensity, Is.EqualTo(writtenPeak.PeakIntensity).Within(0.0000001)); + Assert.That(originalPeak.PeakRTStart, Is.EqualTo(writtenPeak.PeakRTStart).Within(0.0000001)); + Assert.That(originalPeak.PeakRTApex, Is.EqualTo(writtenPeak.PeakRTApex).Within(0.0000001)); + Assert.That(originalPeak.PeakRTEnd, Is.EqualTo(writtenPeak.PeakRTEnd).Within(0.0000001)); + Assert.That(originalPeak.PeakMz, Is.EqualTo(writtenPeak.PeakMz).Within(0.0000001)); + Assert.That(originalPeak.PeakCharge, Is.EqualTo(writtenPeak.PeakCharge)); + Assert.That(originalPeak.NumChargeStatesObserved, Is.EqualTo(writtenPeak.NumChargeStatesObserved)); + 
Assert.That(originalPeak.PeakDetectionType, Is.EqualTo(writtenPeak.PeakDetectionType)); + Assert.That(originalPeak.MBRScore, Is.EqualTo(writtenPeak.MBRScore).Within(0.0000001)); + Assert.That(originalPeak.PSMsMapped, Is.EqualTo(writtenPeak.PSMsMapped)); + Assert.That(originalPeak.BaseSequencesMapped, Is.EqualTo(writtenPeak.BaseSequencesMapped)); + Assert.That(originalPeak.FullSequencesMapped, Is.EqualTo(writtenPeak.FullSequencesMapped)); + Assert.That(originalPeak.PeakSplitValleyRT, Is.EqualTo(writtenPeak.PeakSplitValleyRT)); + Assert.That(originalPeak.PeakApexMassError, Is.EqualTo(writtenPeak.PeakApexMassError).Within(0.0000001)); + } + } + + [Test] + public static void TestFileReadWrite_WithExtensionInPath() + { + var file = FileReader.ReadFile(TestFilePath); + var testOutputPath = Path.Combine(TestDirectory, "TestOutput_QuantifiedPeaks.tsv"); + + file.WriteResults(testOutputPath); + Assert.That(File.Exists(testOutputPath)); + + var writtenFile = new QuantifiedPeakFile(testOutputPath); + Assert.That(file.Count(), Is.EqualTo(writtenFile.Count())); + + for (int i = 0; i < file.Count(); i++) + { + var originalPeak = file.Results[i]; + var writtenPeak = writtenFile.Results[i]; + Assert.That(originalPeak.FileName, Is.EqualTo(writtenPeak.FileName)); + Assert.That(originalPeak.BaseSequence, Is.EqualTo(writtenPeak.BaseSequence)); + Assert.That(originalPeak.FullSequence, Is.EqualTo(writtenPeak.FullSequence)); + Assert.That(originalPeak.ProteinGroup, Is.EqualTo(writtenPeak.ProteinGroup)); + Assert.That(originalPeak.PeptideMonoisotopicMass, Is.EqualTo(writtenPeak.PeptideMonoisotopicMass).Within(0.0000001)); + Assert.That(originalPeak.MS2RetentionTime, Is.EqualTo(writtenPeak.MS2RetentionTime).Within(0.0000001)); + Assert.That(originalPeak.PrecursorCharge, Is.EqualTo(writtenPeak.PrecursorCharge)); + Assert.That(originalPeak.TheoreticalMZ, Is.EqualTo(writtenPeak.TheoreticalMZ).Within(0.0000001)); + Assert.That(originalPeak.PeakIntensity, 
Is.EqualTo(writtenPeak.PeakIntensity).Within(0.0000001)); + Assert.That(originalPeak.PeakRTStart, Is.EqualTo(writtenPeak.PeakRTStart).Within(0.0000001)); + Assert.That(originalPeak.PeakRTApex, Is.EqualTo(writtenPeak.PeakRTApex).Within(0.0000001)); + Assert.That(originalPeak.PeakRTEnd, Is.EqualTo(writtenPeak.PeakRTEnd).Within(0.0000001)); + Assert.That(originalPeak.PeakMz, Is.EqualTo(writtenPeak.PeakMz).Within(0.0000001)); + Assert.That(originalPeak.PeakCharge, Is.EqualTo(writtenPeak.PeakCharge)); + Assert.That(originalPeak.NumChargeStatesObserved, Is.EqualTo(writtenPeak.NumChargeStatesObserved)); + Assert.That(originalPeak.PeakDetectionType, Is.EqualTo(writtenPeak.PeakDetectionType)); + Assert.That(originalPeak.MBRScore, Is.EqualTo(writtenPeak.MBRScore).Within(0.0000001)); + Assert.That(originalPeak.PSMsMapped, Is.EqualTo(writtenPeak.PSMsMapped)); + Assert.That(originalPeak.BaseSequencesMapped, Is.EqualTo(writtenPeak.BaseSequencesMapped)); + Assert.That(originalPeak.FullSequencesMapped, Is.EqualTo(writtenPeak.FullSequencesMapped)); + Assert.That(originalPeak.PeakSplitValleyRT, Is.EqualTo(writtenPeak.PeakSplitValleyRT)); + Assert.That(originalPeak.PeakApexMassError, Is.EqualTo(writtenPeak.PeakApexMassError).Within(0.0000001)); + } + } + } +} diff --git a/mzLib/Test/FileReadingTests/TestRawFileReader.cs b/mzLib/Test/FileReadingTests/TestRawFileReader.cs index d0d2665d3..0b67c1508 100644 --- a/mzLib/Test/FileReadingTests/TestRawFileReader.cs +++ b/mzLib/Test/FileReadingTests/TestRawFileReader.cs @@ -1,12 +1,10 @@ using System; -using System.Collections.Generic; -using System.Collections.Specialized; using System.Diagnostics; using System.IO; using System.Linq; using MassSpectrometry; -using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Readers; namespace Test.FileReadingTests diff --git a/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs b/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs index 
6efc9d96c..3c22e327b 100644 --- a/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs +++ b/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs @@ -32,7 +32,12 @@ internal class TestSupportedFileExtensions [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerProtein_FragPipev21.1individual_protein.tsv", SupportedFileType.MsFraggerProtein)] [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPeptide_FragPipev21.1combined_peptide.tsv", SupportedFileType.MsFraggerPeptide)] [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerProtein_FragPipev21.1combined_protein.tsv", SupportedFileType.MsFraggerProtein)] - + [TestCase(@"FileReadingTests\ExternalFileTypes\FlashLFQ_MzLib1.0.549_QuantifiedPeaks.tsv", SupportedFileType.FlashLFQQuantifiedPeak)] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_TargetResults_IcTarget.tsv", SupportedFileType.MsPathFinderTTargets)] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_DecoyResults_IcDecoy.tsv", SupportedFileType.MsPathFinderTDecoys)] + [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_AllResults_IcTda.tsv", SupportedFileType.MsPathFinderTAllResults)] + [TestCase(@"FileReadingTests\ExternalFileTypes\crux.txt", SupportedFileType.CruxResult)] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults\experiment_annotation.tsv", SupportedFileType.ExperimentAnnotation)] public static void TestSupportedFileTypeExtensions(string filePath, SupportedFileType expectedType) { var supportedType = filePath.ParseFileType(); @@ -52,6 +57,7 @@ public static void EnsureAllExtensionsAreUnique() [Test] public static void TestSupportedFileTypeExtension_Errors() { + string badTest = "badFile.taco"; Exception e = Assert.Throws(() => badTest.ParseFileType()); Assert.That(e?.Message, Is.EqualTo($"File type not supported")); @@ -88,7 +94,5 @@ public static void TestGetFileExtension_Errors() Assert.That(e?.Message, Is.EqualTo($"File type not supported")); } - - } } diff --git 
a/mzLib/Test/FileReadingTests/TestToppicResultFiles.cs b/mzLib/Test/FileReadingTests/TestToppicResultFiles.cs index aa1642078..53528e166 100644 --- a/mzLib/Test/FileReadingTests/TestToppicResultFiles.cs +++ b/mzLib/Test/FileReadingTests/TestToppicResultFiles.cs @@ -6,6 +6,7 @@ using MassSpectrometry; using Newtonsoft.Json; using NUnit.Framework; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Readers; namespace Test.FileReadingTests diff --git a/mzLib/Test/SpectrumProcessingAndFiltering.cs b/mzLib/Test/SpectrumProcessingAndFiltering.cs index 3bcaa6d99..433b29020 100644 --- a/mzLib/Test/SpectrumProcessingAndFiltering.cs +++ b/mzLib/Test/SpectrumProcessingAndFiltering.cs @@ -2,6 +2,7 @@ using Readers; using MassSpectrometry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 7b5882009..13f09b29f 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -1,7 +1,7 @@  - net6.0-windows + net8.0-windows false x64 @@ -13,11 +13,12 @@ - - - + + + + - + @@ -287,6 +288,30 @@ Always + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + Always + + + Always + Always @@ -302,6 +327,18 @@ Always + + Always + + + Always + + + Always + + + Always + Always @@ -475,6 +512,24 @@ Always + + + Always + + + PreserveNewest + + + Always + + + Always + + + PreserveNewest + + + PreserveNewest Always diff --git a/mzLib/Test/TestAminoAcids.cs b/mzLib/Test/TestAminoAcids.cs index e16eb671a..acfcb473f 100644 --- a/mzLib/Test/TestAminoAcids.cs +++ b/mzLib/Test/TestAminoAcids.cs @@ -16,8 +16,8 @@ // You should have received a copy of the GNU Lesser General Public // License along with Proteomics. If not, see . 
-using Chemistry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics.AminoAcidPolymer; using System; using System.Diagnostics.CodeAnalysis; diff --git a/mzLib/Test/TestChemicalFormula.cs b/mzLib/Test/TestChemicalFormula.cs index 124fb7548..483974a1f 100644 --- a/mzLib/Test/TestChemicalFormula.cs +++ b/mzLib/Test/TestChemicalFormula.cs @@ -19,11 +19,11 @@ using Chemistry; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Linq; -using SharpLearning.Containers.Matrices; using Stopwatch = System.Diagnostics.Stopwatch; namespace Test diff --git a/mzLib/Test/TestClassExtensions.cs b/mzLib/Test/TestClassExtensions.cs index 4de0e5ff5..326877386 100644 --- a/mzLib/Test/TestClassExtensions.cs +++ b/mzLib/Test/TestClassExtensions.cs @@ -1,8 +1,7 @@ using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System.Diagnostics.CodeAnalysis; using MzLibUtil; -using System.IO; -using System.Linq; using System; using MassSpectrometry; using System.Collections.Generic; diff --git a/mzLib/Test/TestDeconvolution.cs b/mzLib/Test/TestDeconvolution.cs index 8af70f886..7ee3f59a2 100644 --- a/mzLib/Test/TestDeconvolution.cs +++ b/mzLib/Test/TestDeconvolution.cs @@ -3,6 +3,7 @@ using MassSpectrometry; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics; using Proteomics.ProteolyticDigestion; using System; @@ -11,8 +12,6 @@ using System.Globalization; using System.IO; using System.Linq; -using System.Reflection; -using MassSpectrometry; using Omics.Digestion; using Omics.Modifications; using Test.FileReadingTests; @@ -27,15 +26,18 @@ public sealed class TestDeconvolution #region Old Deconvolution [Test] - [TestCase(586.2143122, 24, 41983672, 586.2)]//This is a lesser abundant charge state envelope at the low mz end - 
[TestCase(740.372202090153, 19, 108419280, 740.37)]//This is the most abundant charge state envelope - [TestCase(1081.385183, 13, 35454636, 1081.385)]//This is a lesser abundant charge state envelope at the high mz end - public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, int selectedIonChargeStateGuess, double selectedIonIntensity, double isolationMz) + [TestCase(586.2143122, 24, 41983672, 586.2)] //This is a lesser abundant charge state envelope at the low mz end + [TestCase(740.372202090153, 19, 108419280, 740.37)] //This is the most abundant charge state envelope + [TestCase(1081.385183, 13, 35454636, + 1081.385)] //This is a lesser abundant charge state envelope at the high mz end + public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, int selectedIonChargeStateGuess, + double selectedIonIntensity, double isolationMz) { MsDataScan[] Scans = new MsDataScan[1]; //txt file, not mgf, because it's an MS1. Most intense proteoform has mass of ~14037.9 Da - string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); + string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); string[] spectrumLines = File.ReadAllLines(Ms1SpectrumPath); @@ -52,7 +54,9 @@ public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, in MzSpectrum spectrum = new MzSpectrum(ms1mzs, ms1intensities, false); - Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); + Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), + "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, + selectedIonChargeStateGuess, 
selectedIonIntensity, isolationMz, 4); var myMsDataFile = new FakeMsDataFile(Scans); @@ -69,21 +73,24 @@ public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, in [Test] [TestCase("APSGGKK", "12-18-17_frac7_calib_ms1_663_665.mzML", 2)] - [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] - [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] + [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", + "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] + [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", + "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] public static void CheckGetMostAbundantObservedIsotopicMass(string peptide, string file, int charge) { Protein test1 = new Protein(peptide, "Accession"); DigestionParams d = new DigestionParams(); - PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); + PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, + CleavageSpecificity.None, "", 0, new Dictionary(), 0); double m = pw.MostAbundantMonoisotopicMass.ToMz(charge); string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file); - var reader = MsDataFileReader.GetDataFile(singleScan); + var reader = MsDataFileReader.GetDataFile(singleScan); reader.LoadAllStaticData(); List singlescan = reader.GetAllScansList(); - + MzSpectrum singlespec = singlescan[0].MassSpectrum; MzRange singleRange = new MzRange(singlespec.XArray.Min(), singlespec.XArray.Max()); int minAssumedChargeState = 1; @@ -92,13 +99,16 @@ public static void 
CheckGetMostAbundantObservedIsotopicMass(string peptide, stri double intensityRatioLimit = 3; //check assigned correctly - List lie2 = singlespec.Deconvolute(singleRange, minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); + List lie2 = singlespec.Deconvolute(singleRange, minAssumedChargeState, + maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); //check that if already assigned, skips assignment and just recalls same value - List lie3 = singlespec.Deconvolute(singleRange, minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); - Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); + List lie3 = singlespec.Deconvolute(singleRange, minAssumedChargeState, + maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); + Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), + lie3.Select(p => p.MostAbundantObservedIsotopicMass)); } #endregion @@ -106,15 +116,18 @@ public static void CheckGetMostAbundantObservedIsotopicMass(string peptide, stri #region Classic Deconvolution [Test] - [TestCase(586.2143122, 24, 41983672, 586.2)]//This is a lesser abundant charge state envelope at the low mz end - [TestCase(740.372202090153, 19, 108419280, 740.37)]//This is the most abundant charge state envelope - [TestCase(1081.385183, 13, 35454636, 1081.385)]//This is a lesser abundant charge state envelope at the high mz end - public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIonMz, int selectedIonChargeStateGuess, double selectedIonIntensity, double isolationMz) + [TestCase(586.2143122, 24, 41983672, 586.2)] //This is a lesser abundant charge state envelope at the 
low mz end + [TestCase(740.372202090153, 19, 108419280, 740.37)] //This is the most abundant charge state envelope + [TestCase(1081.385183, 13, 35454636, + 1081.385)] //This is a lesser abundant charge state envelope at the high mz end + public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIonMz, + int selectedIonChargeStateGuess, double selectedIonIntensity, double isolationMz) { MsDataScan[] Scans = new MsDataScan[1]; //txt file, not mgf, because it's an MS1. Most intense proteoform has mass of ~14037.9 Da - string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); + string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); string[] spectrumLines = File.ReadAllLines(Ms1SpectrumPath); @@ -131,7 +144,9 @@ public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIo MzSpectrum spectrum = new MzSpectrum(ms1mzs, ms1intensities, false); - Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); + Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), + "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, + selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); var myMsDataFile = new FakeMsDataFile(Scans); @@ -142,7 +157,8 @@ public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIo DeconvolutionParameters deconParameters = new ClassicDeconvolutionParameters(1, 60, 4, 3); List isolatedMasses = scan.GetIsolatedMassesAndCharges(scan, deconParameters).ToList(); - List isolatedMasses2 = scan.GetIsolatedMassesAndCharges(scan.MassSpectrum, deconParameters).ToList(); + List 
isolatedMasses2 = + scan.GetIsolatedMassesAndCharges(scan.MassSpectrum, deconParameters).ToList(); List monoIsotopicMasses = isolatedMasses.Select(m => m.MonoisotopicMass).ToList(); List monoIsotopicMasses2 = isolatedMasses2.Select(m => m.MonoisotopicMass).ToList(); @@ -155,13 +171,16 @@ public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIo [Test] [TestCase("APSGGKK", "12-18-17_frac7_calib_ms1_663_665.mzML", 2)] - [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] - [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] + [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", + "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] + [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", + "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptide, string file, int charge) { Protein test1 = new Protein(peptide, "Accession"); DigestionParams d = new DigestionParams(); - PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); + PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, + CleavageSpecificity.None, "", 0, new Dictionary(), 0); double m = pw.MostAbundantMonoisotopicMass.ToMz(charge); string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file); @@ -177,7 +196,8 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid double intensityRatioLimit = 3; DeconvolutionParameters deconParameters = - new ClassicDeconvolutionParameters(minAssumedChargeState, 
maxAssumedChargeState, deconvolutionTolerancePpm, + new ClassicDeconvolutionParameters(minAssumedChargeState, maxAssumedChargeState, + deconvolutionTolerancePpm, intensityRatioLimit); //check assigned correctly @@ -188,7 +208,8 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid //check that if already assigned, skips assignment and just recalls same value List lie3 = Deconvoluter.Deconvolute(singlespec, deconParameters, singleRange).ToList(); - Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); + Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), + lie3.Select(p => p.MostAbundantObservedIsotopicMass)); } #endregion @@ -226,21 +247,22 @@ public void TestNegativeModeClassicDeconvolution(double expectedMz, int expected public static void TestExampleNewDeconvolutionInDeconvoluter() { DeconvolutionParameters deconParams = new ExampleNewDeconvolutionParametersTemplate(1, 60); - var dataFile = MsDataFileReader.GetDataFile(Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "GUACUG_NegativeMode_Sliced.mzML")); + var dataFile = MsDataFileReader.GetDataFile(Path.Combine(TestContext.CurrentContext.TestDirectory, + "DataFiles", "GUACUG_NegativeMode_Sliced.mzML")); dataFile.InitiateDynamicConnection(); var scan = dataFile.GetOneBasedScanFromDynamicConnection(726); var spectrum = scan.MassSpectrum; dataFile.CloseDynamicConnection(); // test switch statements in Deconvoluter - Assert.Throws(() => Deconvoluter.Deconvolute(spectrum, deconParams)); - Assert.Throws(() => Deconvoluter.Deconvolute(scan, deconParams)); + NUnit.Framework.Assert.Throws(() => _ = Deconvoluter.Deconvolute(spectrum, deconParams).ToList()); + NUnit.Framework.Assert.Throws(() => _ =Deconvoluter.Deconvolute(scan, deconParams).ToList()); // test default exceptions in deconvoluter var badEnumValue = (DeconvolutionType)Int32.MaxValue; 
deconParams.GetType().GetProperty("DeconvolutionType")!.SetValue(deconParams, badEnumValue); - Assert.Throws(() => Deconvoluter.Deconvolute(spectrum, deconParams)); - Assert.Throws(() => Deconvoluter.Deconvolute(scan, deconParams)); + NUnit.Framework.Assert.Throws(() => _ = Deconvoluter.Deconvolute(spectrum, deconParams).ToList()); + NUnit.Framework.Assert.Throws(() => _ = Deconvoluter.Deconvolute(scan, deconParams).ToList()); } @@ -248,14 +270,15 @@ public static void TestExampleNewDeconvolutionInDeconvoluter() public static void Test_MsDataScan_GetIsolatedMassesAndCharges() { // get scan - string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "GUACUG_NegativeMode_Sliced.mzML"); + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", + "GUACUG_NegativeMode_Sliced.mzML"); var dataFile = MsDataFileReader.GetDataFile(filePath); var precursorScan = dataFile.GetOneBasedScan(1); var fragmentationScan = dataFile.GetOneBasedScan(2); // set up deconvolution DeconvolutionParameters deconParams = new ClassicDeconvolutionParameters(-10, -1, 20, 3, Polarity.Negative); - + // get isolated masses and charges on an MS1 scan. This means the isolation window is null. var ms1Result = precursorScan.GetIsolatedMassesAndCharges(precursorScan.MassSpectrum, deconParams).ToList(); Assert.That(ms1Result.Count, Is.EqualTo(0)); @@ -263,10 +286,132 @@ public static void Test_MsDataScan_GetIsolatedMassesAndCharges() Assert.That(ms1Result.Count, Is.EqualTo(0)); // get isolated masses and charges on an MS2 scan. 
This should work correctly - var ms2Result = fragmentationScan.GetIsolatedMassesAndCharges(precursorScan.MassSpectrum, deconParams).ToList(); + var ms2Result = fragmentationScan.GetIsolatedMassesAndCharges(precursorScan.MassSpectrum, deconParams) + .ToList(); Assert.That(ms2Result.Count, Is.EqualTo(1)); ms2Result = fragmentationScan.GetIsolatedMassesAndCharges(precursorScan, deconParams).ToList(); Assert.That(ms2Result.Count, Is.EqualTo(1)); } + + [Test] + public void NeutralMassSpectrum_Deconvolute_AllInRange() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges = new[] { 1, 1 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(260.0, 400.0); + + // Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.IsNotNull(result); + Assert.IsInstanceOf>(result); + Assert.AreEqual(2, result.Count()); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } + + + [Test] + public void NeutralMassSpectrum_Deconvolute_AllInRange_Charged() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges = new[] { 3, 3 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(00, 200.0); + + 
// Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.IsNotNull(result); + Assert.IsInstanceOf>(result); + Assert.AreEqual(2, result.Count()); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } + + [Test] + public void NeutralMassSpectrum_Deconvolute_SomeInRange() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges = new[] { 1, 1 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(260.0, 300.0); + + // Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.IsNotNull(result); + Assert.IsInstanceOf>(result); + Assert.AreEqual(1, result.Count()); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } + + [Test] + public void NeutralMassSpectrum_Deconvolute_SomeInRange_Charged() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges 
= new[] { 1, 20 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(260.0, 300.0); + + // Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.IsNotNull(result); + Assert.IsInstanceOf>(result); + Assert.AreEqual(1, result.Count()); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } } -} \ No newline at end of file +} diff --git a/mzLib/Test/TestDigestionMotif.cs b/mzLib/Test/TestDigestionMotif.cs index 91f8cf059..39b7a1704 100644 --- a/mzLib/Test/TestDigestionMotif.cs +++ b/mzLib/Test/TestDigestionMotif.cs @@ -1,5 +1,7 @@ using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Proteomics; using Proteomics.ProteolyticDigestion; using System; diff --git a/mzLib/Test/TestElementsAndIsotopes.cs b/mzLib/Test/TestElementsAndIsotopes.cs index c8a9342d0..a6f12b130 100644 --- a/mzLib/Test/TestElementsAndIsotopes.cs +++ b/mzLib/Test/TestElementsAndIsotopes.cs @@ -18,6 +18,7 @@ using Chemistry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System; using Stopwatch = System.Diagnostics.Stopwatch; diff --git a/mzLib/Test/TestFragments.cs b/mzLib/Test/TestFragments.cs index c377f3551..ea4a307c3 100644 --- a/mzLib/Test/TestFragments.cs +++ b/mzLib/Test/TestFragments.cs @@ -17,10 +17,11 @@ // License along with 
Proteomics. If not, see . using Chemistry; -using Easy.Common.Extensions; using MassSpectrometry; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Omics.Fragmentation; using Omics.Fragmentation.Peptide; using Proteomics; diff --git a/mzLib/Test/TestIsolation.cs b/mzLib/Test/TestIsolation.cs index 8026d1a64..d48772a02 100644 --- a/mzLib/Test/TestIsolation.cs +++ b/mzLib/Test/TestIsolation.cs @@ -19,6 +19,7 @@ using MassSpectrometry; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics.AminoAcidPolymer; using System; using System.Linq; diff --git a/mzLib/Test/TestLibrarySpectrum.cs b/mzLib/Test/TestLibrarySpectrum.cs index 57d496b2c..ba0817f3c 100644 --- a/mzLib/Test/TestLibrarySpectrum.cs +++ b/mzLib/Test/TestLibrarySpectrum.cs @@ -1,10 +1,6 @@ using NUnit.Framework; -using Proteomics.PSM; -using System; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; using Omics.Fragmentation; using Omics.SpectrumMatch; diff --git a/mzLib/Test/TestMassMzCalculations.cs b/mzLib/Test/TestMassMzCalculations.cs index e4e739390..4f1e88f5a 100644 --- a/mzLib/Test/TestMassMzCalculations.cs +++ b/mzLib/Test/TestMassMzCalculations.cs @@ -17,6 +17,7 @@ using Chemistry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System; using Stopwatch = System.Diagnostics.Stopwatch; diff --git a/mzLib/Test/TestModFits.cs b/mzLib/Test/TestModFits.cs index 3b0b2c225..48362d17f 100644 --- a/mzLib/Test/TestModFits.cs +++ b/mzLib/Test/TestModFits.cs @@ -1,4 +1,5 @@ using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics; using System; using Omics.Modifications; diff --git a/mzLib/Test/TestModifications.cs b/mzLib/Test/TestModifications.cs index 
12f1566e3..021f08dc2 100644 --- a/mzLib/Test/TestModifications.cs +++ b/mzLib/Test/TestModifications.cs @@ -20,6 +20,8 @@ using MassSpectrometry; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Proteomics; using Proteomics.AminoAcidPolymer; using Proteomics.ProteolyticDigestion; diff --git a/mzLib/Test/TestMsFraggerCombinedResults.cs b/mzLib/Test/TestMsFraggerCombinedResults.cs new file mode 100644 index 000000000..bd5d8834d --- /dev/null +++ b/mzLib/Test/TestMsFraggerCombinedResults.cs @@ -0,0 +1,109 @@ +using NUnit.Framework; +using Readers; +using System; +using System.Collections.Generic; +using System.Linq; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.IO; +using TopDownProteomics; +using OxyPlot; +using System.Diagnostics.CodeAnalysis; + +namespace Test +{ + [ExcludeFromCodeCoverage] + internal class TestMsFraggerCombinedResults + { + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + public void TestLoadResultsCount(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + Assert.That(ms.AllPsmFiles.Count.Equals(2)); + Assert.That(ms.Results.Count.Equals(8)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + public void TestLoadResults(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + List results = ms.Results.Select(psm => psm.FileName).ToList(); + + Assert.That((results.Count(s => s.Contains("A_1"))).Equals(4)); + Assert.That((results.Count(s => s.Contains("A_2"))).Equals(4)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + 
public void TestFileNameToFilePathWithParameter(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + List fullFilePath = new List(); + // these local files are not actually accessed, they are fillers to test the method + string fullFilePath1 = @"E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.raw"; + string fullFilePath2 = @"E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.raw"; + fullFilePath.Add(fullFilePath1); + fullFilePath.Add(fullFilePath2); + + List results = ms.Results.Select(psm => psm.FileName).ToList(); + Dictionary allFiles = ms.FileNameToFilePath(fullFilePath); + List filePaths = ms.ExperimentAnnotations.Select(psm => psm.File).ToList(); + + foreach (var fileName in results) + { + Assert.That(allFiles.TryGetValue(fileName, out var output)); + Assert.That(filePaths.Contains(output)); + } + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + public void TestFileNameToFilePathWithoutParameter(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + List results = ms.Results.Select(psm => psm.FileName).ToList(); + Dictionary allFiles = ms.FileNameToFilePath(); + List filePaths = ms.ExperimentAnnotations.Select(psm => psm.File).ToList(); + + foreach (var fileName in results) + { + Assert.That(allFiles.TryGetValue(fileName, out var output)); + Assert.That(filePaths.Contains(output)); + } + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults\experiment_annotation.tsv")] + public void TestExperimentAnnotationFile(string path) + { + string fileToWrite = Path.Combine(TestContext.CurrentContext.TestDirectory, + 
@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults\copy_experiment_annotation.tsv"); + if (File.Exists(fileToWrite)) + { + File.Delete(fileToWrite); + } + + string fileToRead = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + + ExperimentAnnotationFile experimentAnnotation = FileReader.ReadFile(fileToRead); + + experimentAnnotation.WriteResults(fileToWrite); + Assert.That(File.Exists(fileToWrite)); + + File.Delete(fileToWrite); + } + } +} \ No newline at end of file diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index d0ee538d6..e2864c1e6 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -1,9 +1,5 @@ using NUnit.Framework; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using MzLibUtil; namespace Test @@ -28,10 +24,13 @@ public sealed class TestMzLibUtil [TestCase("penguin", "penguin")] [TestCase("penguin.jpg.gz", "penguin")] [TestCase("penguin.jpg.zip", "penguin")] + [TestCase("penguin.jpg.mzXML", "penguin.jpg")] public static void TestPeriodTolerantFilenameWithoutExtension(string filenameAndOrPath, string expectedResult) { string result = PeriodTolerantFilenameWithoutExtension.GetPeriodTolerantFilenameWithoutExtension(filenameAndOrPath); + string extensionResult = filenameAndOrPath.GetPeriodTolerantFilenameWithoutExtension(); Assert.AreEqual(expectedResult, result); + Assert.AreEqual(expectedResult, extensionResult); } } } diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index 549d504cf..64ce5dbc7 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -1,11 +1,14 @@ using Chemistry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Proteomics; using Proteomics.ProteolyticDigestion; 
using System; using System.Collections.Generic; using System.IO; using System.Linq; +using MzLibUtil; using Omics; using Omics.Digestion; using Omics.Fragmentation; @@ -53,7 +56,8 @@ public static void TestDifferentProteaseEquals() Assert.That(pep1.Parent.Equals(pep2.Parent)); Assert.That(!pep1.DigestionParams.DigestionAgent.Equals(pep2.DigestionParams.DigestionAgent)); Assert.That(!pep1.Equals(pep2)); - Assert.That(!pep1.GetHashCode().Equals(pep2.GetHashCode())); + // HashCode is only concerned with the full sequence, not the protease. Only the equals method is interested in the protease used + Assert.That(pep1.GetHashCode().Equals(pep2.GetHashCode())); } [Test] @@ -763,8 +767,8 @@ public static void TestReverseDecoyFromTarget() int testTargetHash = p.GetHashCode(); // Hash code corresponding to the decoy sequence, should be PairedTargetDecoyHash for target int testDecoyHash = reverse.GetHashCode(); - Assert.AreEqual(reverse.PairedTargetDecoyHash, testTargetHash); - Assert.AreEqual(p.PairedTargetDecoyHash, testDecoyHash); + Assert.AreEqual(reverse.PairedTargetDecoySequence.GetHashCode(), testTargetHash); + Assert.AreEqual(p.PairedTargetDecoySequence.GetHashCode(), testDecoyHash); Assert.AreEqual("EDITPEPK", reverse.BaseSequence); Assert.AreEqual(new int[] { 6, 5, 4, 3, 2, 1, 0, 7 }, newAminoAcidPositions); Assert.IsTrue(reverse.Protein.IsDecoy); @@ -839,8 +843,8 @@ public static void TestReverseDecoyFromTarget() int testMirrorTargetHash = p_tryp.GetHashCode(); // Hash code corresponding to the decoy sequence, should be PairedTargetDecoyHash for target int testMirrorDecoyHash = p_tryp_reverse.GetHashCode(); - Assert.AreEqual(testMirrorTargetHash, p_tryp_reverse.PairedTargetDecoyHash); - Assert.AreEqual(testMirrorDecoyHash, p_tryp.PairedTargetDecoyHash); + Assert.AreEqual(testMirrorTargetHash, p_tryp_reverse.PairedTargetDecoySequence.GetHashCode()); + Assert.AreEqual(testMirrorDecoyHash, p_tryp.PairedTargetDecoySequence.GetHashCode()); Assert.AreEqual("RVTRITV", 
p_tryp_reverse.BaseSequence); Assert.AreEqual(new int[] { 6, 5, 4, 3, 2, 1, 0 }, newAminoAcidPositions); Assert.IsTrue(p_tryp_reverse.AllModsOneIsNterminus.ContainsKey(1));//n-term acetyl @@ -869,8 +873,8 @@ public static void TestScrambledDecoyFromTarget() int testTargetHash = p.GetHashCode(); // Hash code corresponding to the decoy sequence, should be PairedTargetDecoyHash for target int testDecoyHash = testScrambled.GetHashCode(); - Assert.AreEqual(testScrambled.PairedTargetDecoyHash, testTargetHash); - Assert.AreEqual(p.PairedTargetDecoyHash, testDecoyHash); + Assert.AreEqual(testScrambled.PairedTargetDecoySequence.GetHashCode(), testTargetHash); + Assert.AreEqual(p.PairedTargetDecoySequence.GetHashCode(), testDecoyHash); Assert.AreEqual("IDEETPPK", testScrambled.BaseSequence); Assert.AreEqual(new int[] { 4, 5, 6, 1, 3, 0, 2, 7 }, newAminoAcidPositions); // Check n-term acetyl @@ -1178,5 +1182,96 @@ public static void TestPeptideWithSetModsNoParentProtein() Assert.AreEqual('-', last.NextAminoAcid); Assert.AreEqual('-', last.NextResidue); } + + [Test] + public static void TestIBioPolymerWithSetModsModificationFromFullSequence() + { + Dictionary un = new Dictionary(); + var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml")); + Dictionary formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized); + List UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"), + formalChargesDictionary).ToList(); + List proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"), + true, DecoyType.None, UniProtPtms, false, new string[] { "exclude_me" }, out un); + var allKnownModDict = UniProtPtms.ToDictionary(p => p.IdWithMotif, p => p); + var digestionParameters = new DigestionParams(maxModsForPeptides: 3); + + foreach (Protein p in proteins) + { + List 
digestedPeptides = + p.Digest(digestionParameters, [], [], null, null).ToList(); + // take the most modified peptide by base sequence and ensure all methods function properly + foreach (var targetPeptide in digestedPeptides + .Where(pep => pep.FullSequence.Contains('[')) + .GroupBy(pep => pep.BaseSequence) + .Select(pepGroup => pepGroup.MaxBy(pep => pep.AllModsOneIsNterminus.Count))) + { + var startResidue = targetPeptide.OneBasedStartResidue; + var endResidue = targetPeptide.OneBasedEndResidue; + + // Pull our expected modifications based upon parent protein object with a maximum value of DigestionParameters.MaxMods + // A bunch of logic to count the number of expected modifications based upon the xml database entries + int expectedModCount = 0; + foreach (var modDictEntry in p.OneBasedPossibleLocalizedModifications + .Where(mod => mod.Key >= startResidue && mod.Key <= endResidue)) + { + if (modDictEntry.Value.Count > 1) + { + var locRestrictions = modDictEntry.Value.Select(mod => mod.LocationRestriction).ToList(); + + if (locRestrictions.AllSame()) + { + if (locRestrictions.First() == "Anywhere.") + expectedModCount++; + else if (locRestrictions.First() == "N-terminal." && modDictEntry.Key == startResidue) + expectedModCount++; + } + else if (modDictEntry.Value.Select(mod => mod.LocationRestriction).Contains("Anywhere.") + && modDictEntry.Value.Select(mod => mod.LocationRestriction) + .Contains("N-terminal.")) + { + expectedModCount++; + if (modDictEntry.Key == startResidue) + expectedModCount++; + } + } + else + { + switch (modDictEntry.Value.First().LocationRestriction) + { + case "Anywhere.": + case "N-terminal." 
when modDictEntry.Key == startResidue: + expectedModCount++; + break; + } + } + } + + expectedModCount = Math.Min(expectedModCount, digestionParameters.MaxMods); + + var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod => + mod.Key >= startResidue && + mod.Key <= endResidue).SelectMany(mod => mod.Value).ToList(); + + // Parse modifications from PWSM and two IBioPolymerWithSetMods methods + var pwsmModDict = targetPeptide.AllModsOneIsNterminus; + var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + var bpwsmModList = IBioPolymerWithSetMods.GetModificationsFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + + // Ensure all methods are in agreement by modification count + Assert.AreEqual(pwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModList.Count, expectedModCount); + + // Ensure all methods are in agreement by modification identify + foreach (var pwsmModification in pwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModList) + Assert.Contains(pwsmModification, expectedModifications); + } + } + } } } \ No newline at end of file diff --git a/mzLib/Test/TestPeptides.cs b/mzLib/Test/TestPeptides.cs index c6a0193ce..bdd7778c8 100644 --- a/mzLib/Test/TestPeptides.cs +++ b/mzLib/Test/TestPeptides.cs @@ -19,6 +19,7 @@ using Chemistry; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics.AminoAcidPolymer; using Proteomics.ProteolyticDigestion; using System; diff --git a/mzLib/Test/TestProductMassesMightHaveDuplicates.cs b/mzLib/Test/TestProductMassesMightHaveDuplicates.cs index dd6702d16..af60acb98 100644 --- 
a/mzLib/Test/TestProductMassesMightHaveDuplicates.cs +++ b/mzLib/Test/TestProductMassesMightHaveDuplicates.cs @@ -1,6 +1,7 @@ using Chemistry; using MassSpectrometry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics; using Proteomics.ProteolyticDigestion; using System; diff --git a/mzLib/Test/TestProteinDatabase.cs b/mzLib/Test/TestProteinDatabase.cs index b03fd98c7..fffa0ef55 100644 --- a/mzLib/Test/TestProteinDatabase.cs +++ b/mzLib/Test/TestProteinDatabase.cs @@ -1,4 +1,6 @@ using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Proteomics; using System.Collections.Generic; using System.IO; diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index 2e430d67a..02cc3aed5 100644 --- a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -1,6 +1,8 @@ using Chemistry; using MassSpectrometry; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Proteomics; using Proteomics.AminoAcidPolymer; using Proteomics.ProteolyticDigestion; @@ -14,6 +16,8 @@ using UsefulProteomicsDatabases; using static Chemistry.PeriodicTable; using Stopwatch = System.Diagnostics.Stopwatch; +using MzLibUtil; +using System.Runtime.CompilerServices; namespace Test { @@ -357,6 +361,193 @@ public static void Test_ProteinDigest() Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence); } + [Test] + [TestCase("cRAP_databaseGPTMD.xml")] + [TestCase("uniprot_aifm1.fasta")] + public static void TestDecoyScramblingIsReproducible(string fileName) + { + // Load in proteins + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName); + DecoyType decoyType = DecoyType.Reverse; + List proteins1 = null; + List proteins2 = null; + if (fileName.Contains(".xml")) + { + proteins1 = 
ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, null, false, null, out unknownModifications); + } + else if (fileName.Contains(".fasta")) + { + proteins1 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out var unknownModifications); + proteins2 = ProteinDbLoader.LoadProteinFasta(dbPath, true, decoyType, false, out unknownModifications); + } + else + { + Assert.Fail("Unknown file type"); + } + + DigestionParams d = new DigestionParams( + maxMissedCleavages: 1, + minPeptideLength: 5, + initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + // Digest target proteins + var pepsToReplace = proteins1.Where(p => !p.IsDecoy) + .SelectMany(p => p.Digest(d, new List(), new List()).ToList()) + .Select(pep => pep.BaseSequence) + .ToHashSet(); + + // Ensure at least one decoy peptide from each protein is problematic and must be replaced + var singleDecoyPeptides = proteins1 + .Where(p => p.IsDecoy) + .Select(p => p.Digest(d, new List(), new List()).Skip(2).Take(1)) + .Select(pwsm => pwsm.First().BaseSequence) + .ToHashSet(); + + //modify targetpeptides in place + pepsToReplace.UnionWith(singleDecoyPeptides); + + // Scramble every decoy from db1 + List decoys1 = new(); + foreach (var protein in proteins1.Where(p => p.IsDecoy)) + { + decoys1.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace)); + } + // Scramble every decoy from db2 + List decoys2 = new(); + foreach (var protein in proteins2.Where(p => p.IsDecoy)) + { + decoys2.Add(Protein.ScrambleDecoyProteinSequence(protein, d, pepsToReplace)); + } + + // check are equivalent lists of proteins + Assert.AreEqual(decoys1.Count, decoys2.Count); + foreach (var decoyPair in decoys1.Concat(decoys2).GroupBy(p => p.Accession)) + { + Assert.AreEqual(2, decoyPair.Count()); + Assert.AreEqual(decoyPair.First().BaseSequence, decoyPair.Last().BaseSequence); + } + } + + 
[Test] + public static void TestDecoyScramblerReplacesPeptides() + { + DigestionParams d = new DigestionParams( + maxMissedCleavages: 1, + minPeptideLength: 5, + initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + + Protein target = new Protein("MEDEEKFVGYKYGVFK", "target"); + Protein decoy = new Protein("EEDEMKYGVFKFVGYK", "decoy"); + + var targetPep = target.Digest(d, new List(), new List()); + var decoyPep = decoy.Digest(d, new List(), new List()); + + HashSet targetPepSeqs = targetPep.Select(p => p.FullSequence).ToHashSet(); + var offendingDecoys = decoyPep.Where(p => targetPepSeqs.Contains(p.FullSequence)).Select(d => d.FullSequence).ToList(); + + Assert.AreEqual(2, offendingDecoys.Count); + + Protein scrambledDecoy = Protein.ScrambleDecoyProteinSequence(decoy, d, targetPepSeqs, offendingDecoys); + var scrambledPep = scrambledDecoy.Digest(d, new List(), new List()); + + Assert.AreEqual(decoyPep.Count(), scrambledPep.Count()); + Assert.IsFalse(scrambledPep.Any(p => offendingDecoys.Contains(p.FullSequence))); + + // Check to make sure that decoy generation also works in no offending sequences are passed in + scrambledDecoy = Protein.ScrambleDecoyProteinSequence(decoy, d, targetPepSeqs); + scrambledPep = scrambledDecoy.Digest(d, new List(), new List()); + + Assert.AreEqual(decoyPep.Count(), scrambledPep.Count()); + Assert.IsFalse(scrambledPep.Any(p => offendingDecoys.Contains(p.FullSequence))); + } + + [Test] + public static void TestDecoyScramblerModificationHandling() + { + DigestionParams d = new DigestionParams( + maxMissedCleavages: 1, + minPeptideLength: 5, + initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + + ModificationMotif.TryGetMotif("G", out ModificationMotif motifG); + ModificationMotif.TryGetMotif("F", out ModificationMotif motifF); + Modification modG = new Modification("myMod", null, "myModType", null, motifG, "Anywhere.", null, 10, null, null, null, null, null, null); + Modification modF = new 
Modification("myMod", null, "myModType", null, motifF, "Anywhere.", null, 10, null, null, null, null, null, null); + + IDictionary> modDictDecoy = new Dictionary> + { + {8, new List { modG } }, + {10, new List { modF } } + }; + + Protein target = new Protein("MEDEEKFVGYKYGVFK", "target"); //, oneBasedModifications: modDictTarget); + Protein decoy = new Protein("EEDEMKYGVFKFVGYK", "decoy", oneBasedModifications: modDictDecoy); + + var targetPep = target.Digest(d, new List(), new List()); + var decoyPep = decoy.Digest(d, new List(), new List()); + + HashSet targetPepSeqs = targetPep.Select(p => p.FullSequence).ToHashSet(); + var offendingDecoys = decoyPep.Where(p => targetPepSeqs.Contains(p.FullSequence)).Select(d => d.FullSequence).ToList(); + Protein scrambledDecoy = Protein.ScrambleDecoyProteinSequence(decoy, d, targetPepSeqs, offendingDecoys); + + var fIndex = scrambledDecoy.BaseSequence.IndexOf("F"); + var gIndex = scrambledDecoy.BaseSequence.IndexOf("G"); // We modified the first residue, so we don't need all locations, just the first + var fIndices = scrambledDecoy.BaseSequence.IndexOfAll("F"); + var gIndices = scrambledDecoy.BaseSequence.IndexOfAll("G"); + + Assert.AreEqual(2, gIndices.Count()); + Assert.AreEqual(2, fIndices.Count()); + Assert.AreEqual(fIndices.First(), fIndex); + + Assert.True(scrambledDecoy.OneBasedPossibleLocalizedModifications.ContainsKey(fIndex + 1)); + Assert.True(scrambledDecoy.OneBasedPossibleLocalizedModifications[fIndex+1].Contains(modF)); + + Assert.True(scrambledDecoy.OneBasedPossibleLocalizedModifications.ContainsKey(gIndex + 1)); + Assert.True(scrambledDecoy.OneBasedPossibleLocalizedModifications[gIndex + 1].Contains(modG)); + + Assert.AreEqual(scrambledDecoy.OneBasedPossibleLocalizedModifications.Count, 2); + } + + + + [Test, Timeout(5000)] + public static void TestDecoyScramblerNoInfiniteLoops() + { + DigestionParams d = new DigestionParams( + maxMissedCleavages: 0, + minPeptideLength: 3, + initiatorMethionineBehavior: 
InitiatorMethionineBehavior.Retain); + + Protein target = new Protein("MEK", "target"); + Protein decoy = new Protein("EMK", "decoy"); + + var targetPep = target.Digest(d, new List(), new List()); + var decoyPep = decoy.Digest(d, new List(), new List()); + + HashSet targetPepSeqs = targetPep.Select(p => p.FullSequence).ToHashSet(); + + // We'll pretend that this is also a target sequence and can't be used as a decoy + HashSet offendingDecoys = new HashSet { "EMK" }; + + // You can't win in this scenario, there's no way to scramble that results in a different decoy + Protein scrambledDecoy = Protein.ScrambleDecoyProteinSequence(decoy, d, targetPepSeqs.Union(offendingDecoys).ToHashSet(), offendingDecoys); + var scrambledPep = scrambledDecoy.Digest(d, new List(), new List()); + + Assert.AreEqual(decoyPep.Count(), scrambledPep.Count()); + + d = new DigestionParams( + maxMissedCleavages: 1, + minPeptideLength: 3, + initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + + offendingDecoys = new HashSet { "KEK" }; + + var impossibleDecoy = new Protein("KEK", "target"); // This guy could crash the shuffling algorithm + scrambledDecoy = Protein.ScrambleDecoyProteinSequence(impossibleDecoy, d, offendingDecoys, offendingDecoys); + + Assert.AreEqual("KEK", scrambledDecoy.BaseSequence); + } + [Test] /// /// Tests that a PeptideWithSetModifications object can be parsed correctly from a string, with mod info diff --git a/mzLib/Test/TestProteinProperties.cs b/mzLib/Test/TestProteinProperties.cs index f22e71df9..6c8d866dd 100644 --- a/mzLib/Test/TestProteinProperties.cs +++ b/mzLib/Test/TestProteinProperties.cs @@ -1,4 +1,5 @@ using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics; using System; using System.Collections.Generic; diff --git a/mzLib/Test/TestPtmListLoader.cs b/mzLib/Test/TestPtmListLoader.cs index 3cdcf1412..dae556a02 100644 --- a/mzLib/Test/TestPtmListLoader.cs +++ b/mzLib/Test/TestPtmListLoader.cs @@ -1,6 +1,6 @@ 
using MzLibUtil; using NUnit.Framework; -using Proteomics; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System; using System.IO; using System.Linq; diff --git a/mzLib/Test/TestRangeAndTolerances.cs b/mzLib/Test/TestRangeAndTolerances.cs index dd248047c..8a08d94f0 100644 --- a/mzLib/Test/TestRangeAndTolerances.cs +++ b/mzLib/Test/TestRangeAndTolerances.cs @@ -18,6 +18,7 @@ using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System; using System.Linq; using Stopwatch = System.Diagnostics.Stopwatch; diff --git a/mzLib/Test/TestRetentionTimePrediction.cs b/mzLib/Test/TestRetentionTimePrediction.cs index f312c92a0..7fc048375 100644 --- a/mzLib/Test/TestRetentionTimePrediction.cs +++ b/mzLib/Test/TestRetentionTimePrediction.cs @@ -1,5 +1,5 @@ using NUnit.Framework; -using Proteomics; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Proteomics.ProteolyticDigestion; using Proteomics.RetentionTimePrediction; using System; diff --git a/mzLib/Test/TestSpectra.cs b/mzLib/Test/TestSpectra.cs index 8a8b0eb15..fec83ba8b 100644 --- a/mzLib/Test/TestSpectra.cs +++ b/mzLib/Test/TestSpectra.cs @@ -22,6 +22,8 @@ using System; using System.Collections.Generic; using System.Linq; +using Chemistry; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Stopwatch = System.Diagnostics.Stopwatch; namespace Test @@ -341,5 +343,111 @@ public void TestEqualsAndHashCode() Assert.That(!_mzSpectrumA.Equals(2)); Assert.That(!_mzSpectrumA.Equals((object)2)); } + + + [Test] + public void NeutralMassSpectrum_Constructor_ValidArguments_InitializesProperties() + { + double[] monoisotopicMasses = { 100.0, 200.0, 300.0 }; + double[] intensities = { 0.5, 0.8, 1.0 }; + int[] charges = { 1, 2, 3 }; + + var spectrum = new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, true); + + Assert.That(monoisotopicMasses.Length, Is.EqualTo(spectrum.XArray.Length)); + Assert.That(intensities.Length, 
Is.EqualTo(spectrum.YArray.Length)); + Assert.That(charges.Length, Is.EqualTo(spectrum.Charges.Length)); + } + + [Test] + public void NeutralMassSpectrum_Constructor_InvalidArguments_ThrowsArgumentException() + { + double[] monoisotopicMasses = { 100.0, 200.0, 300.0 }; + double[] intensities = { 0.5, 0.8 }; + int[] charges = { 1, 2, 3 }; + bool shouldCopy = true; + + Assert.Throws(() => new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, shouldCopy)); + } + + [Test] + public void NeutralMassSpectrum_MzPeak() + { + double[] monoisotopicMasses = { 100.0, 200.0, 300.0 }; + double[] intensities = { 0.5, 0.8, 1.0 }; + int[] charges = { 1, 2, 3 }; + var spectrum = new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, true); + + + var peak = spectrum.Extract(50, 210).ToArray(); + Assert.That(peak.Length, Is.EqualTo(2)); + + for (int i = 0; i < peak.Length; i++) + { + double mono = monoisotopicMasses[i]; + int charge = charges[i]; + double intensity = intensities[i]; + double mz = mono.ToMz(charge); + + Assert.That(peak[i].Mz, Is.EqualTo(mz)); + Assert.That(peak[i].Intensity, Is.EqualTo(intensity)); + } + } + + [Test] + public void NeutralMassSpectrum_MzRange() + { + double[] monoisotopicMasses = { 100.0, 200.0, 300.0 }; + double[] intensities = { 0.5, 0.8, 1.0 }; + int[] charges = { 1, 2, 3 }; + var spectrum = new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, true); + + + var peak = spectrum.Extract(50, 2100).ToArray(); + Assert.That(peak.Length, Is.EqualTo(3)); + var minPeak = peak.MinBy(p => p.Mz); + var maxPeak = peak.MaxBy(p => p.Mz); + + Assert.That(minPeak.Mz, Is.EqualTo(spectrum.Range.Minimum)); + Assert.That(minPeak.Mz, Is.EqualTo(spectrum.FirstX)); + Assert.That(maxPeak.Mz, Is.EqualTo(spectrum.Range.Maximum)); + Assert.That(maxPeak.Mz, Is.EqualTo(spectrum.LastX)); + + for (int i = 0; i < peak.Length; i++) + { + double mono = monoisotopicMasses[i]; + int charge = charges[i]; + double intensity = intensities[i]; + double mz 
= mono.ToMz(charge); + + Assert.That(peak[i].Mz, Is.EqualTo(mz)); + Assert.That(peak[i].Intensity, Is.EqualTo(intensity)); + } + } + + [Test] + public void NeutralMassSpectrum_Constructor_ValidArguments_InitializesCharges() + { + // Arrange + double[,] monoisotopicMassesIntensities = new double[,] { { 100.0, 200.0 }, { 300.0, 400.0 } }; + int[] charges = new int[] { 1, 2 }; + + // Act + var spectrum = new NeutralMassSpectrum(monoisotopicMassesIntensities, charges); + + // Assert + Assert.AreEqual(charges, spectrum.Charges); + } + + [Test] + public void NeutralMassSpectrum_Constructor2_InvalidArguments_ThrowsArgumentException() + { + // Arrange + double[,] monoisotopicMassesIntensities = new double[,] { { 100.0, 200.0 }, { 300.0, 400.0 } }; + int[] charges = new int[] { 1, 2, 3 }; + + // Act & Assert + Assert.Throws(() => new NeutralMassSpectrum(monoisotopicMassesIntensities, charges)); + } } } \ No newline at end of file diff --git a/mzLib/Test/TestSpectralSimilarity.cs b/mzLib/Test/TestSpectralSimilarity.cs index 9dbf79085..1d4b53819 100644 --- a/mzLib/Test/TestSpectralSimilarity.cs +++ b/mzLib/Test/TestSpectralSimilarity.cs @@ -2,6 +2,7 @@ using MassSpectrometry.MzSpectra; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using System; using System.Collections.Generic; @@ -19,7 +20,8 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() //Test all different similarity calculations MzSpectrum experimentalSpectrum = new(new double[] { 1, 2, 3, 4, 5 }, new double[] { 2, 4, 6, 8, 10 }, false); MzSpectrum theoreticalSpectrum = new(new double[] { 3, 4, 5, 6, 7 }, new double[] { 9, 7, 5, 3, 1 }, false); - SpectralSimilarity s = new(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true, true, 0); + // + SpectralSimilarity s = new(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, 
ppmTolerance, true, 0); //mz pairs in tolerance are (3,3), (4,4), (5,5). Since we are using all peaks, we get 7 intensity pairs with 1,2,6 and 7 intensities being paired zero Assert.AreEqual(7, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.8).Within(0.01)); @@ -29,13 +31,13 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() Assert.That(s.PearsonsCorrelation(), Is.EqualTo(0.42).Within(0.01)); Assert.That(s.DotProduct(), Is.EqualTo(0.17).Within(0.01)); Assert.That(s.SearleSimilarity(), Is.EqualTo(2.4391).Within(0.01)); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); Assert.That(s.SpectralEntropy(), Is.EqualTo(0.79).Within(0.01)); //Test all normalization schemes experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 1 }, new double[] { 2 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, true, true,0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, true,0); //mz pairs in tolerance are (1,1). 
Since we are creating pairs for all experimental peaks we get additional pairs for 2 and 3 with zero intensities Assert.AreEqual(3, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.27).Within(0.01)); @@ -45,7 +47,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() Assert.That(s.PearsonsCorrelation(), Is.EqualTo(-0.87).Within(0.01)); Assert.That(s.DotProduct(), Is.EqualTo(0.33).Within(0.01)); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true,true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.27).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.17).Within(0.01)); Assert.That(s.EuclideanDistance(), Is.EqualTo(-.03).Within(0.01)); @@ -53,7 +55,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() Assert.That(s.PearsonsCorrelation(), Is.EqualTo(-0.87).Within(0.01)); Assert.That(s.DotProduct(), Is.EqualTo(0.17).Within(0.01)); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true,true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true, 0); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.41).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.27).Within(0.01)); Assert.That(s.EuclideanDistance(), Is.EqualTo(0.07).Within(0.01)); @@ -61,7 +63,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() Assert.That(s.PearsonsCorrelation(), Is.EqualTo(-0.90).Within(0.01)); Assert.That(s.DotProduct(), Is.EqualTo(0.24).Within(0.01)); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, 
SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true,true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, 0); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.27).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.17).Within(0.01)); Assert.That(s.EuclideanDistance(), Is.EqualTo(-6.21).Within(0.01)); @@ -75,7 +77,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() Assert.Throws(() => { - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); }, "Empty YArray in spectrum."); //We should have any zero intensity YArrays but just to be sure @@ -84,14 +86,14 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() Assert.Throws(() => { - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); }, "Spectrum has no intensity."); //What happens when all intensity pairs include a zero experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 4 }, new double[] { 2 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true,true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); //There are no mz pairs in 
tolerance. But we automatically get a pair for 4 because it is a theoretical mz. And we get pairs for 1,2,3 with zero intensities because we are using all experimental peaks. Assert.AreEqual(4, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0).Within(0.01)); @@ -104,7 +106,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 0, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 1, 2, 3, 4 }, new double[] { 2, 0, 2, 2 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true,true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); //mz pairs in tolerance are (1,1), (2,2), (3,3). We also automatically get a pair for 4 because it is a theoretical mz. Assert.AreEqual(4, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.48).Within(0.01)); @@ -117,7 +119,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() //Test what happens when all intensity pairs include 1 zero experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 0, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 4, 5 }, new double[] { 2, 0 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true,true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); //There are no mz pairs in tolerance. We eliminate the peak at 3 because it has zero intensity. We also remove the theoretical pair at 5 because it has zero intensity. //We get 3 pairs for 2, 3 and 4. 
Assert.AreEqual(3, s.IntensityPairs.Count); @@ -132,14 +134,14 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3, 4 }, new double[] { 1, 2, 3, 4 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 1.000011, 1.99997, 3.000031, 3.99995 }, new double[] { 1, 2, 3, 4 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true,true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); //The ppm difference between the 4 closest pairs are 11, 15,10.3 and 12.5. These are all beyond the 10ppm tolerance that is allowed. Therefore we get 8 intensity pairs with all intensities being paired zero Assert.AreEqual(8, s.IntensityPairs.Count); //Test alternate constructor experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 1 }, new double[] { 2 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, true, true,0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, true,0); //mz pairs in tolerance are (1,1). Since we are using all peaks, we get 3 intensity pairs with 2 and 3 intensities being paired z Assert.AreEqual(3, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.27).Within(0.01)); @@ -152,7 +154,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() //Test alternate constructor only library peaks. 
Since library has one peak, and primary has three peaks, we get only one intensity pair experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 1 }, new double[] { 2 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, false, true,0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, false,0); //mz pairs in tolerance are (1,1). Since we are NOT using all peaks, we get only 1 intensity pair Assert.AreEqual(1, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(1.0).Within(0.01)); @@ -165,7 +167,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() //Test cosine similarity when there are no peaks from spectrum one matching spectrum 2 experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 4,6,8 }, new double[] { 2,4,6 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, false, true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, false, 0); //There are no mz pairs in tolerance. 
But we keep all three theoretical peaks so we get three intesity pairs with all intensities being paired zero Assert.AreEqual(3, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0).Within(0.01)); @@ -174,7 +176,7 @@ public void TestAllSpectrumSimilaritiesWithoutMzFilter() //Test SearleSimilarity with both spectra are identical experimentalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 1, 2, 3 }, new double[] { 2, 4, 6 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, false, true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum.XArray, theoreticalSpectrum.YArray, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, false, 0); //there are 3 mz pairs in tolerance. Assert.AreEqual(s.SearleSimilarity(), double.MaxValue); } @@ -187,7 +189,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() MzSpectrum theoreticalSpectrum = new MzSpectrum(new double[] { 200, 300, 500, 600 ,800}, new double[] { 9, 7, 5, 3, 1 }, false); //Test when using all peaks of primary(experimental) and secondary(theoretical) spectra (bool all experimental peaks is true) and mz cut off is 0 (no cut off) //we are keeping everything and there are 8 mz peaks with more than zero intnsity so we get 8 intensity pairs - SpectralSimilarity s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true,true, 0); + SpectralSimilarity s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true, 0); Assert.AreEqual(8, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), 
Is.EqualTo(0.68).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.48).Within(0.01)); @@ -197,7 +199,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() Assert.That(s.DotProduct(), Is.EqualTo(0.13).Within(0.01)); //Test when using all peaks of primary(experimental) and secondary(theoretical) spectra (bool all experimental peaks is true) and mz cut off is 300 (default cut off) - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true); //similary to above but we remove peaks below 300. That leaves 6 intensity pairs. Assert.AreEqual(6, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.70).Within(0.01)); @@ -210,7 +212,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() //Test when not using all peaks of primary(experimental) spectra (bool all experimental peaks is false) and mz cut off is is 0 (no cut off) //experimental xArray 100, 200, 300, 400, 500, 600, 700 and theoretical xArray 200, 300, 500, 600, 800. So, 200, 300, 500, 600 are common. We keep 800 from the theoretical spectrum //because the default is to keep all theoretical peaks. 
- s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, false, true, 0); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, false, 0); Assert.AreEqual(5, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.903).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.718).Within(0.01)); @@ -222,7 +224,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() //Test when not using all peaks of primary(experimental) spectra (bool all experimental peaks is false) and mz cut off is is 300 (default cut off) //primary xArray 100, 200, 300, 400, 500, 600, 700 and secondary xArray 200, 300, 500, 600, 800. So, 200, 300, 500, 600 are common. But with 300 cut off, only 300, 500, 600 are common //we keep 800 because it is a theoretical peak. - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, false, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, false); Assert.AreEqual(4, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.924).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.75).Within(0.01)); @@ -234,7 +236,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() //Test all different similarity calculations experimentalSpectrum = new MzSpectrum(new double[] { 100, 200, 300, 400, 500 }, new double[] { 2, 4, 6, 8, 10 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 300, 400, 500, 600, 700 }, new double[] { 9, 7, 5, 3, 1 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, 
SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, false, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, false); //all experimental peaks is false and there is no mz cut off. //Therefore we get a intensity pair for each theoretical mz for a total of 5. Assert.AreEqual(5, s.IntensityPairs.Count); @@ -248,7 +250,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() //Test all normalization schemes experimentalSpectrum = new MzSpectrum(new double[] { 1000, 2000, 3000 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 1000 }, new double[] { 2 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, true, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.MostAbundantPeak, ppmTolerance, true); //there is one mz pair in tolerance (1000,1000). 
Since we are using all experimental peaks, we get additional pairs for 2000 and 3000 with zero intensities Assert.AreEqual(3, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.267).Within(0.01)); @@ -258,7 +260,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() Assert.That(s.PearsonsCorrelation(), Is.EqualTo(-0.866).Within(0.01)); Assert.That(s.DotProduct(), Is.EqualTo(0.333).Within(0.01)); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.27).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.17).Within(0.01)); Assert.That(s.EuclideanDistance(), Is.EqualTo(-.03).Within(0.01)); @@ -266,7 +268,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() Assert.That(s.PearsonsCorrelation(), Is.EqualTo(-0.87).Within(0.01)); Assert.That(s.DotProduct(), Is.EqualTo(0.17).Within(0.01)); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SquareRootSpectrumSum, ppmTolerance, true); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.41).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.27).Within(0.01)); Assert.That(s.EuclideanDistance(), Is.EqualTo(0.07).Within(0.01)); @@ -274,7 +276,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() Assert.That(s.PearsonsCorrelation(), Is.EqualTo(-0.90).Within(0.01)); Assert.That(s.DotProduct(), Is.EqualTo(0.24).Within(0.01)); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, 
SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true); Assert.That(s.CosineSimilarity(), Is.EqualTo(0.27).Within(0.01)); Assert.That(s.SpectralContrastAngle(), Is.EqualTo(0.17).Within(0.01)); Assert.That(s.EuclideanDistance(), Is.EqualTo(-6.21).Within(0.01)); @@ -286,7 +288,7 @@ public void TestAllSpectrumSimilaritiesWithDefaultedMzFilter() //What happens when all intensity pairs include a zero experimentalSpectrum = new MzSpectrum(new double[] { 100, 200, 300 }, new double[] { 2, 4, 6 }, false); theoreticalSpectrum = new MzSpectrum(new double[] { 400, 500 }, new double[] { 2, 4 }, false); - s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, true); + s = new SpectralSimilarity(experimentalSpectrum, theoreticalSpectrum, SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true); //There are no mz pairs in tolerence. But, we are keeping all experimental peaks. With the theoretical peaks with mz >= 300. 
That gives us 3 intensity pairs Assert.AreEqual(3, s.IntensityPairs.Count); Assert.That(s.CosineSimilarity(), Is.EqualTo(0).Within(0.01)); @@ -393,7 +395,7 @@ public void TestKullbackLeiblerDivergence() double[] q_YArray = new double[] { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 }; //Test when using all peaks of primary(experimental) and secondary(theoretical) spectra(bool allpeaks is true) and mz cut off is 0(no cut off) - SpectralSimilarity s = new(p_XArray, p_YArray, q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, true, 0); + SpectralSimilarity s = new(p_XArray, p_YArray, q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, 0); Assert.That(s.KullbackLeiblerDivergence_P_Q(), Is.EqualTo(0.0853).Within(0.001)); // ignore negative intensity @@ -403,7 +405,7 @@ public void TestKullbackLeiblerDivergence() q_YArray = new double[] { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 }; //Test when using all peaks of primary(experimental) and secondary(theoretical) spectra (bool allpeaks is true) and mz cut off is 0 (no cut off) - s = new(p_XArray, p_YArray, q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, true, 0); + s = new(p_XArray, p_YArray, q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, 0); Assert.That(s.KullbackLeiblerDivergence_P_Q(), Is.EqualTo(0.0853).Within(0.001)); Assert.That(s.KullbackLeiblerDivergence_P_Q(), Is.EqualTo(s.KullbackLeiblerDivergence_P_Q(correctionConstant: 0)).Within(0.001)); @@ -414,7 +416,7 @@ public void TestKullbackLeiblerDivergence() q_YArray = new double[] { 1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0 }; //Test when using all peaks of primary(experimental) and secondary(theoretical) spectra (bool allpeaks is true) and mz cut off is 0 (no cut off) - s = new(p_XArray, p_YArray, q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, 
true, 0); + s = new(p_XArray, p_YArray, q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, 0); Assert.That(s.KullbackLeiblerDivergence_P_Q(), Is.EqualTo(0.0853).Within(0.001)); // correct for 0 intensity values @@ -425,7 +427,7 @@ public void TestKullbackLeiblerDivergence() //Test when using all peaks of primary(experimental) and secondary(theoretical) spectra (bool all experimental peaks is true) and mz cut off is 0 (no cut off) s = new(p_XArray, p_YArray, q_XArray, q_YArray, - SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, true, 0); + SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); // With correction, this should increase divergence for missing peaks Assert.That(s.KullbackLeiblerDivergence_P_Q(), Is.EqualTo(3.467).Within(0.01)); Assert.That(s.KullbackLeiblerDivergence_P_Q() > s.KullbackLeiblerDivergence_P_Q(correctionConstant: 0)); @@ -438,7 +440,7 @@ public void TestKullbackLeiblerDivergence() //Test when using all peaks of primary(experimental) and secondary(theoretical) spectra (bool allpeaks is true) and mz cut off is 0 (no cut off) s = new(p_XArray, p_YArray, q_XArray, q_YArray, - SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, true, 0); + SpectralSimilarity.SpectrumNormalizationScheme.SpectrumSum, ppmTolerance, true, 0); // With correction, this should increase divergence for missing peaks Assert.That(s.KullbackLeiblerDivergence_P_Q(), Is.GreaterThan(3)); Assert.That(s.KullbackLeiblerDivergence_P_Q() > s.KullbackLeiblerDivergence_P_Q(correctionConstant: 0)); @@ -449,7 +451,7 @@ public void TestKullbackLeiblerDivergence() q_XArray = new double[] { 1, 2, 3, 4 }; q_YArray = new double[] { 0.0 / 3.0, 0.0 / 25.0, 1.0 / 3.0, 8.0 / 25.0 }; - s = new(p_XArray, p_YArray, q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, true, 0); + s = new(p_XArray, p_YArray, 
q_XArray, q_YArray, SpectralSimilarity.SpectrumNormalizationScheme.Unnormalized, ppmTolerance, true, 0); // With correction, this should increase divergence for missing peaks Assert.That(s.KullbackLeiblerDivergence_P_Q() == null); diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.fasta b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta new file mode 100644 index 000000000..c222589c1 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta @@ -0,0 +1,2 @@ +>id:2|Name:20mer1|SOterm:20mer1|Type:tRNA|Subtype:Ala|Feature:VGC|Cellular_Localization:freezer|Species:standard +GUACUGCCUCUAGUGAAGCA \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz new file mode 100644 index 000000000..2fe54f9ab Binary files /dev/null and b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz differ diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.xml b/mzLib/Test/Transcriptomics/TestData/20mer1.xml new file mode 100644 index 000000000..6f17d6f3d --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/20mer1.xml @@ -0,0 +1,17 @@ + + + + 20mer1 + 20mer1 + + + 20mer1 + + + + + standard + + GUACUGCCUCUAGUGAAGCA + + \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz b/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz new file mode 100644 index 000000000..19dac16bf Binary files /dev/null and b/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz differ diff --git a/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta new file mode 100644 index 000000000..18802a82a --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta @@ -0,0 +1,10 @@ +>id:1|Name:tdbR00000010|SOterm:SO:0000254|Type:tRNA|Subtype:Ala|Feature:VGC|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli 
+GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA +>id:2|Name:tdbR00000008|SOterm:SO:0000254|Type:tRNA|Subtype:Ala|Feature:GGC|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GGGGCUAUAGCUCAGCUGGGAGAGCGCUUGCAUGGCAUGCAAGAGGUCAGCGGUUCGAUCCCGCUUAGCUCCACCA +>id:3|Name:tdbR00000356|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:ICG|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCAUCCGUAGCUCAGCUGGAUAGAGUACUCGGCUACGAACCGAGCGGUCGGAGGUUCGAAUCCUCCCGGAUGCACCA +>id:4|Name:tdbR00000359|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:CCG|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCGCCCGUAGCUCAGCUGGAUAGAGCGCUGCCCUCCGGAGGCAGAGGUCUCAGGUUCGAAUCCUGUCGGGCGCGCCA +>id:5|Name:tdbR00000358|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:UCU|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCGCCCUUAGCUCAGUUGGAUAGAGCAACGACCUUCUAAGUCGUGGGCCGCAGGUUCGAAUCCUGCAGGGCGCGCCA diff --git a/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz new file mode 100644 index 000000000..11ab87ef2 Binary files /dev/null and b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz differ diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs new file mode 100644 index 000000000..e1ef6af90 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs @@ -0,0 +1,171 @@ +using NUnit.Framework; +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using UsefulProteomicsDatabases.Transcriptomics; +using UsefulProteomicsDatabases; +using Transcriptomics; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestDbLoader + { + public static string 
ModomicsUnmodifedFastaPath => Path.Combine(TestContext.CurrentContext.TestDirectory, + "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta"); + + /// + /// Detect the headertype of the test cases + /// + private static IEnumerable<(string, RnaFastaHeaderType)> DetectHeaderTestCases => + new List<(string, RnaFastaHeaderType)> + { + (Path.Combine(TestContext.CurrentContext.TestDirectory, "DoubleProtease.tsv"), RnaFastaHeaderType.Unknown), + (ModomicsUnmodifedFastaPath, RnaFastaHeaderType.Modomics), + (Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta"), RnaFastaHeaderType.Modomics), + + }; + + /// + /// Test the correctness of checking headertype + /// + /// + [Test] + [TestCaseSource(nameof(DetectHeaderTestCases))] + public static void TestDetectHeaderType((string dbPath, RnaFastaHeaderType headerType) testData) + { + string line = File.ReadLines(testData.dbPath).First(); + if (char.IsDigit(line.First())) + { + line = File.ReadLines(testData.dbPath).Skip(1).First(); + } + var type = RnaDbLoader.DetectRnaFastaHeaderType(line); + Assert.That(testData.headerType, Is.EqualTo(type)); + } + + + [Test] + [TestCase("ModomicsUnmodifiedTrimmed.fasta")] + [TestCase("ModomicsUnmodifiedTrimmed.fasta.gz")] + public static void TestModomicsUnmodifiedFasta(string databaseFileName) + { + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", + databaseFileName); + var oligos = RnaDbLoader.LoadRnaFasta(dbPath, true, DecoyType.None, false, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(5)); + Assert.That(oligos.First().BaseSequence, + Is.EqualTo("GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA")); + Assert.That(oligos.First().Name, Is.EqualTo("tdbR00000010")); + Assert.That(oligos.First().Accession, Is.EqualTo("SO:0000254")); + Assert.That(oligos.First().Organism, Is.EqualTo("Escherichia 
coli")); + Assert.That(oligos.First().DatabaseFilePath, Is.EqualTo(dbPath)); + Assert.That(oligos.First().IsContaminant, Is.False); + Assert.That(oligos.First().IsDecoy, Is.False); + Assert.That(oligos.First().AdditionalDatabaseFields!.Count, Is.EqualTo(5)); + Assert.That(oligos.First().AdditionalDatabaseFields!["Id"], Is.EqualTo("1")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Type"], Is.EqualTo("tRNA")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Subtype"], Is.EqualTo("Ala")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Feature"], Is.EqualTo("VGC")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Cellular Localization"], Is.EqualTo("prokaryotic cytosol")); + } + + [Test] + public static void TestContaminantFollowsThrough() + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, true, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(5)); + Assert.That(oligos.First().BaseSequence, + Is.EqualTo("GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA")); + Assert.That(oligos.All(p => p.IsContaminant)); + Assert.That(oligos.All(p => !p.IsDecoy)); + } + + [Test] + public static void TestNotGeneratingTargetsOrDecoys() + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, false, DecoyType.None, true, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(0)); + } + + [Test] + public static void TestXmlWriterReader() + { + var rna = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, false, out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + + var modString = "ID Methylation\r\nMT Biological\r\nPP Anywhere.\r\nTG G\r\nCF C1H2\r\n" + @"//"; + var methylG = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).First(); + + Dictionary>> mods = new Dictionary>>(); + mods.Add("SO:0000254", 
new HashSet>() + { + new Tuple(1, methylG), + new Tuple(3, methylG) + }); + + string outpath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.xml"); + + var xml = ProteinDbWriter.WriteXmlDatabase(mods, rna, outpath); + var temp = RnaDbLoader.LoadRnaXML(outpath, true, DecoyType.None, false, + new List() { methylG }, new List(), out var unknownMods); + + Assert.That(unknownMods.Count, Is.EqualTo(0)); + Assert.That(temp.Count, Is.EqualTo(5)); + var first = temp.First(); + var loadedMods = first.OneBasedPossibleLocalizedModifications; + Assert.That(loadedMods.Count, Is.EqualTo(2)); + Assert.That(loadedMods[1].Count, Is.EqualTo(1)); + Assert.That(loadedMods[3].Count, Is.EqualTo(1)); + Assert.That(loadedMods[1].First().IdWithMotif, Is.EqualTo(methylG.IdWithMotif)); + Assert.That(loadedMods[3].First().IdWithMotif, Is.EqualTo(methylG.IdWithMotif)); + } + + [Test] + [TestCase("ATCG", "AUCG", true)] + [TestCase("ATCG", "UAGC", false)] + [TestCase("ATCGZ", "AUCGZ", true)] + [TestCase("ATCGZ", "UAGCZ", false)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACT", "AUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACU", true)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACT", "UAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGA", false)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACT", "AUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACU", true)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACT", 
"UAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGA", false)] + public static void TestTranscribe(string input, string expected, bool isCodingStrand) + { + Assert.That(input.Transcribe(isCodingStrand), Is.EqualTo(expected)); + } + + [Test] + [TestCase("20mer1.fasta")] + [TestCase("20mer1.fasta.gz")] + [TestCase("20mer1.xml")] + [TestCase("20mer1.xml.gz")] + public static void TestDbReadingDifferentExtensions(string databaseFileName) + { + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", + databaseFileName); + + List rna; + if (dbPath.Contains("fasta")) + rna = RnaDbLoader.LoadRnaFasta(dbPath, true, DecoyType.None, false, + out var errors); + else + rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.None, false, + new List(), new List(), out _); + + Assert.That(rna.Count, Is.EqualTo(1)); + Assert.That(rna.First().BaseSequence, Is.EqualTo("GUACUGCCUCUAGUGAAGCA")); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs new file mode 100644 index 000000000..acf6cbff9 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs @@ -0,0 +1,280 @@ +using NUnit.Framework; +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework.Interfaces; +using Transcriptomics; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases.Transcriptomics; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestDecoyGeneration + { + public static string ModomicsUnmodifiedFastaPath => TestDbLoader.ModomicsUnmodifedFastaPath; + + [Test] + public static void 
TestReverseDecoy_Simple() + { + var oligos = new List() + { + new RNA("GUUCUG"), + new RNA("GUGCUA"), + }; + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Reverse, 1); + Assert.That(decoys.Count, Is.EqualTo(2)); + Assert.That(decoys[0].BaseSequence, Is.EqualTo("UCUUGG")); + Assert.That(decoys[1].BaseSequence, Is.EqualTo("UCGUGA")); + + var example = oligos.First(); + Assert.That(decoys.All(p => !p.IsContaminant)); + Assert.That(decoys.All(p => p.IsDecoy)); + Assert.That(decoys.All(p => p.DatabaseFilePath == example.DatabaseFilePath)); + Assert.That(decoys.All(p => p.Organism == example.Organism)); + Assert.That(decoys.All(p => p.AdditionalDatabaseFields == example.AdditionalDatabaseFields)); + Assert.That(decoys.All(p => p.Accession == example.Accession)); + Assert.That(decoys.All(p => p.Name == example.Name)); + Assert.That(decoys.All(p => p.Length == example.Length)); + Assert.That(decoys.All(p => Equals(p.FivePrimeTerminus, example.FivePrimeTerminus))); + Assert.That(decoys.All(p => Equals(p.ThreePrimeTerminus, example.ThreePrimeTerminus))); + Assert.That(decoys.All(p => p.OneBasedPossibleLocalizedModifications.Count == example.OneBasedPossibleLocalizedModifications.Count)); + } + + [Test] + [TestCase("GUACUG", 1, "UCAUGG", 5)] + [TestCase("GUACUA", 2, "UCAUGA", 4)] + [TestCase("GUACUA", 3, "UCAUGA", 3)] + [TestCase("GUACUA", 4, "UCAUGA", 2)] + [TestCase("GUCCAA", 5, "ACCUGA", 1)] + [TestCase("GUUCUA", 6, "UCUUGA", 6)] + public static void TestReverseDecoy_SimpleWithMods(string rnaSequence, int modPosition, string expectedDecoySequence, int expectedDecoyModPosition) + { + var mod = new Modification(); + var oligos = new List() + { + new RNA(rnaSequence, null, null, + new Dictionary>() + { { modPosition, new List() { mod } } }), + }; + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Reverse, 1); + Assert.That(decoys.Count, Is.EqualTo(1)); + + var decoy = decoys.First(); + var originalRna = oligos.First(); + 
Assert.That(decoy.BaseSequence, Is.EqualTo(expectedDecoySequence)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(1)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Key, Is.EqualTo(expectedDecoyModPosition)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Value.Count, Is.EqualTo(1)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Value.First(), Is.EqualTo(mod)); + Assert.That(decoy.Name, Is.EqualTo(originalRna.Name)); + Assert.That(decoy.Accession, Is.EqualTo(originalRna.Accession)); + Assert.That(decoy.Organism, Is.EqualTo(originalRna.Organism)); + Assert.That(decoy.DatabaseFilePath, Is.EqualTo(originalRna.DatabaseFilePath)); + Assert.That(decoy.IsContaminant, Is.EqualTo(originalRna.IsContaminant)); + Assert.That(decoy.IsDecoy, Is.True); + Assert.That(decoy.AdditionalDatabaseFields, Is.EqualTo(originalRna.AdditionalDatabaseFields)); + Assert.That(decoy.FivePrimeTerminus, Is.EqualTo(originalRna.FivePrimeTerminus)); + Assert.That(decoy.ThreePrimeTerminus, Is.EqualTo(originalRna.ThreePrimeTerminus)); + } + + [Test] + public void TestReverseDecoy_FromDatabase() + { + int numSequences = 5; + Dictionary expectedSequences = new Dictionary() + { + { "tdbR00000010", "CCACCUCGAUACGCCCUAGCUUGGCGUCUGGAGGACGCACGUUUCGUCCGCGAGAGGGUCGACUCGAUAUCGGGGA"}, + { "tdbR00000008", "CCACCUCGAUUCGCCCUAGCUUGGCGACUGGAGAACGUACGGUACGUUCGCGAGAGGGUCGACUCGAUAUCGGGGA"}, + { "tdbR00000356", "CCACGUAGGCCCUCCUAAGCUUGGAGGCUGGCGAGCCAAGCAUCGGCUCAUGAGAUAGGUCGACUCGAUGCCUACGA"}, + { "tdbR00000359", "CCGCGCGGGCUGUCCUAAGCUUGGACUCUGGAGACGGAGGCCUCCCGUCGCGAGAUAGGUCGACUCGAUGCCCGCGA"}, + { "tdbR00000358", "CCGCGCGGGACGUCCUAAGCUUGGACGCCGGGUGCUGAAUCUUCCAGCAACGAGAUAGGUUGACUCGAUUCCCGCGA"}, + }; + + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Reverse, false, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(numSequences * 2)); + 
Assert.That(oligos.Count(p => p.IsDecoy), Is.EqualTo(numSequences)); + Assert.That(oligos.Count(p => !p.IsDecoy), Is.EqualTo(numSequences)); + + foreach (var targetDecoyGroup in oligos.GroupBy(p => p.Name)) + { + Assert.That(targetDecoyGroup.Count(), Is.EqualTo(2)); + var target = targetDecoyGroup.First(p => !p.IsDecoy); + var decoy = targetDecoyGroup.First(p => p.IsDecoy); + var expectedSequence = expectedSequences[target.Name]; + + Assert.That(target.FivePrimeTerminus, Is.EqualTo(decoy.FivePrimeTerminus)); + Assert.That(target.ThreePrimeTerminus, Is.EqualTo(decoy.ThreePrimeTerminus)); + Assert.That(target.AdditionalDatabaseFields, Is.EqualTo(decoy.AdditionalDatabaseFields)); + Assert.That(target.IsContaminant, Is.EqualTo(decoy.IsContaminant)); + Assert.That(target.DatabaseFilePath, Is.EqualTo(decoy.DatabaseFilePath)); + Assert.That(target.DatabaseFilePath, Is.EqualTo(ModomicsUnmodifiedFastaPath)); + Assert.That(target.Organism, Is.EqualTo(decoy.Organism)); + Assert.That(target.Accession, Is.EqualTo(decoy.Accession)); + Assert.That(target.Name, Is.EqualTo(decoy.Name)); + Assert.That(target.Length, Is.EqualTo(decoy.Length)); + Assert.That(target.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(decoy.OneBasedPossibleLocalizedModifications.Count)); + + Assert.That(decoy.BaseSequence, Is.EqualTo(expectedSequence)); + } + } + + + // TODO: Implement these tests once other decoy generation methods are available + + [Test] + public void TestShuffledDecoy_Simple() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + Assert.Throws(() => + { + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + }); + + + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public void TestShuffledDecoy_SimpleWithMods() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + Assert.Throws(() => + { + var decoys = 
RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + }); + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public void TestShuffledDecoy_FromDatabase() + { + Assert.Throws(() => + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Shuffle, false, out var errors); + }); + + //var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Shuffle, false, out var errors); + //Assert.That(errors.Count, Is.EqualTo(0)); + //Assert.That(oligos.Count, Is.EqualTo(10)); + } + + [Test] + public void TestSlideDecoy_Simple() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + Assert.Throws(() => + { + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + }); + + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public void TestSlideDecoy_SimpleWithMods() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + + Assert.Throws(() => + { + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + }); + + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public void TestSlideDecoy_FromDatabase() + { + Assert.Throws(() => + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Slide, false, out var errors); + }); + + //var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Slide, false, out var errors); + //Assert.That(errors.Count, Is.EqualTo(0)); + //Assert.That(oligos.Count, Is.EqualTo(10)); + } + + + [Test] + public void TestCreateNew() + { + var mods = PtmListLoader.ReadModsFromString( + "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> 
modsOut).ToList(); + var modDict = mods.ToDictionary(p => p.IdWithMotif, p => p); + var oneBasedPossibleLocalizedModifications = new Dictionary>() + { + { 1, new List() { modDict["Sodium on A"] } }, + { 3, new List() { modDict["Sodium on A"] } }, + }; + + var rna = new RNA("GAACUG", "name", "accession", "organism", "databaseFilePath", + null, null, oneBasedPossibleLocalizedModifications, false, false, new List>(), + new Dictionary()); + var oligos = rna + .Digest(new RnaDigestionParams(maxMods: 1), new List(), mods) + .ToList(); + + var clonedRna = rna.CreateNew(null, null, true); + var clonedOligo = oligos.First().CreateNew(null, null, true); + + // ensure they are identical except for the isDecoy field + Assert.That(rna.BaseSequence, Is.EqualTo(clonedRna.BaseSequence)); + Assert.That(rna.OneBasedPossibleLocalizedModifications, Is.EqualTo(clonedRna.OneBasedPossibleLocalizedModifications)); + Assert.That(rna.IsDecoy, Is.Not.EqualTo(clonedRna.IsDecoy)); + + Assert.That(oligos.First().BaseSequence, Is.EqualTo(clonedOligo.BaseSequence)); + Assert.That(oligos.First().OneBasedPossibleLocalizedModifications, Is.EqualTo(clonedOligo.OneBasedPossibleLocalizedModifications)); + Assert.That(oligos.First().Parent.IsDecoy, Is.Not.EqualTo(clonedOligo.Parent.IsDecoy)); + + + var newMods = new Dictionary>() + { + { 1, new List() { modDict["Sodium on A"] } }, + { 2, new List() { modDict["Sodium on A"] } }, + { 3, new List() { modDict["Sodium on A"] } }, + }; + clonedRna = rna.CreateNew("AAAAAA", newMods, null); + clonedOligo = oligos.First().CreateNew("AAAAAA", newMods, null); + + Assert.That(rna.BaseSequence, Is.Not.EqualTo(clonedRna.BaseSequence)); + Assert.That(rna.OneBasedPossibleLocalizedModifications, Is.Not.EqualTo(clonedRna.OneBasedPossibleLocalizedModifications)); + Assert.That(rna.IsDecoy, Is.EqualTo(clonedRna.IsDecoy)); + + Assert.That(oligos.First().BaseSequence, Is.Not.EqualTo(clonedOligo.BaseSequence)); + 
Assert.That(oligos.First().OneBasedPossibleLocalizedModifications, Is.Not.EqualTo(clonedOligo.OneBasedPossibleLocalizedModifications)); + Assert.That(oligos.First().Parent.IsDecoy, Is.EqualTo(clonedOligo.Parent.IsDecoy)); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs new file mode 100644 index 000000000..acfcacdef --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -0,0 +1,1186 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using Chemistry; +using MassSpectrometry; +using MzLibUtil; +using NUnit.Framework; +using Omics; +using Omics.Digestion; +using Omics.Fragmentation; +using Omics.Modifications; +using Transcriptomics; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestDigestion + { + public record RnaDigestionTestCase(string BaseSequence, string Enzyme, int MissedCleavages, int MinLength, + int MaxLength, int DigestionProductCount, + double[] MonoMasses, string[] Sequences); + + public static IEnumerable GetTestCases() + { + // 6bp Top Down + yield return new RnaDigestionTestCase("GUACUG", "top-down", + 0, 1, 6, 1, + new[] { 1874.28 }, + new[] { "GUACUG" }); + // 6bp Rnase T1, normal + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 1, 6, 2, + new[] { 363.057, 1529.234 }, + new[] { "G", "UACUG" }); + // 6bp Cusativin, normal + yield return new RnaDigestionTestCase("GUACUG", "Cusativin", + 0, 1, 6, 2, + new[] { 1303.175, 589.116 }, + new[] { "GUAC", "UG" }); + // 6bp Rnase T1, one product too short + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 3, 6, 1, + new[] { 1529.234 }, + new[] { "UACUG" }); + // 6bp Rnase T1, one product too long + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 1, 2, 1, + new[] { 363.057 }, + new[] { "G" 
}); + // 6bp Rnase T1, 1 missed cleavage + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 1, 1, 6, 3, + new[] { 363.057, 1529.234, 1874.28 }, + new[] { "G", "UACUG", "GUACUG" }); + // 6bp Rnase A + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 0, 1, 6, 4, + new[] { 669.082, 652.103, 324.035, 283.091 }, + new[] { "GU", "AC", "U", "G" }); + // 6bp Rnase A, 1 missed cleavage + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 1, 1, 6, 7, + new[] { 669.082, 652.103, 324.035, 283.091, 1303.175, 958.128, 589.116 }, + new[] { "GU", "AC", "U", "G", "GUAC", "ACU", "UG" }); + // 6bp Rnase A, 2 missed cleavages + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 2, 1, 6, 9, + new[] { 669.082, 652.103, 324.035, 283.091, 1303.175, 958.128, 589.116, 1609.200, 1223.209 }, + new[] { "GU", "AC", "U", "G", "GUAC", "ACU", "UG", "GUACU", "ACUG" }); + // 20bp top-down + yield return new RnaDigestionTestCase("GUACUGCCUCUAGUGAAGCA", "top-down", + 0, 1, int.MaxValue, 1, + new[] { 6363.871 }, + new[] { "GUACUGCCUCUAGUGAAGCA" }); + // 20bp Rnase T1, normal + yield return new RnaDigestionTestCase("GUACUGCCUCUAGUGAAGCA", "RNase T1", + 0, 1, int.MaxValue, 6, + new[] { 363.057, 1609.200, 2219.282, 669.082, 1021.161, 572.137 }, + new[] { "G", "UACUG", "CCUCUAG", "UG", "AAG", "CA" }); + } + + public static string rnaseTsvpath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Digestion\rnases.tsv"); + + [OneTimeSetUp] + public void OneTimeSetup() + { + RnaseDictionary.Dictionary = RnaseDictionary.LoadRnaseDictionary(rnaseTsvpath); + } + + #region Rnase + + [Test] + public void TestRnaseDictionaryLoading() + { + var rnaseCountFromTsv = File.ReadAllLines(rnaseTsvpath).Length - 1; + Assert.That(RnaseDictionary.Dictionary.Count, Is.EqualTo(rnaseCountFromTsv)); + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestRnase_GetUnmodifiedOligos_Counts(RnaDigestionTestCase testCase) + { + RNA rna = new 
RNA(testCase.BaseSequence); + Rnase rnase = RnaseDictionary.Dictionary[testCase.Enzyme]; + var digestionProducts = + rnase.GetUnmodifiedOligos(rna, testCase.MissedCleavages, testCase.MinLength, testCase.MaxLength); + + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestRnase_GetUnmodifiedOligo_Sequence(RnaDigestionTestCase testCase) + { + RNA rna = new RNA(testCase.BaseSequence); + Rnase rnase = RnaseDictionary.Dictionary[testCase.Enzyme]; + var digestionProducts = + rnase.GetUnmodifiedOligos(rna, testCase.MissedCleavages, testCase.MinLength, testCase.MaxLength); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var product = digestionProducts[i]; + var testCaseCaseSequence = testCase.Sequences[i]; + Assert.That(product.BaseSequence, Is.EqualTo(testCaseCaseSequence)); + } + } + + [Test] + public void TestRnaseEqualityProperties() + { + Rnase t1 = RnaseDictionary.Dictionary["RNase T1"]; + Rnase t1Duplicate = RnaseDictionary.Dictionary["RNase T1"]; + Rnase t2 = RnaseDictionary.Dictionary["RNase T2"]; + + Assert.That(t1.Equals(t1Duplicate)); + Assert.That(t1.Equals(t1)); + Assert.That(!t1.Equals(t2)); + Assert.That(!t1.Equals(null)); + Assert.That(t1.GetHashCode(), Is.EqualTo(t1Duplicate.GetHashCode())); + Assert.That(t1.GetHashCode(), Is.Not.EqualTo(t2.GetHashCode())); + Assert.That(t1.Equals((object)t1Duplicate)); + Assert.That(t1.Equals((object)t1)); + Assert.That(!t1.Equals((object)t2)); + Assert.That(!t1.Equals((object)null)); + // ReSharper disable once SuspiciousTypeConversion.Global + Assert.That(!t1.Equals((object)new RNA("GUA"))); + } + + [Test] + public void TestRnase_UnmodifiedOligos_Exception() + { + Rnase rnase = new Rnase("Bad", CleavageSpecificity.SingleC, new List()); + Assert.Throws(() => { rnase.GetUnmodifiedOligos(new RNA("GUACUG"), 0, 1, 6); }); + } + + #endregion 
+ + #region NucleolyticOligo + + [Test] + public void TestNucleolyticOligoProperties_FivePrimeDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + var oligo = digestionProducts[0]; + Assert.That(oligo.BaseSequence, Is.EqualTo("G")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(1)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(1)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('U')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('-')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_ThreePrimeDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + NucleolyticOligo oligo = digestionProducts[2]; + Assert.That(oligo.BaseSequence, Is.EqualTo("CUG")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(4)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(6)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('-')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('A')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_InternalDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + NucleolyticOligo 
oligo = digestionProducts[1]; + Assert.That(oligo.BaseSequence, Is.EqualTo("UA")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(3)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('C')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('G')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_TopDownDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["top-down"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(1)); + + NucleolyticOligo oligo = digestionProducts[0]; + Assert.That(oligo.BaseSequence, Is.EqualTo("GUACUG")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(1)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(6)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('-')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('-')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + #endregion + + #region OligoWithSetMods + + private static (string Sequence, int FragmentNumber, ProductType Type, double Mass)[] DigestFragmentTestCases => + new (string Sequence, int FragmentNumber, ProductType Type, double Mass)[] + { + ("UAG", 0, ProductType.M, 998.134), + ("UAG", 1, ProductType.aBaseLoss, 114.031), ("UAG", 2, ProductType.aBaseLoss, 420.056), + ("UAG", 1, ProductType.c, 308.031), ("UAG", 2, ProductType.c, 637.093), + ("UAG", 1, ProductType.dWaterLoss, 306.025), ("UAG", 2, ProductType.dWaterLoss, 635.077), + ("UAG", 1, ProductType.w, 443.023), ("UAG", 2, ProductType.w, 772.075), + ("UAG", 1, ProductType.y, 
363.057), ("UAG", 2, ProductType.y, 692.109), + ("UAG", 1, ProductType.yWaterLoss, 345.047), ("UAG", 2, ProductType.yWaterLoss, 674.100), + + ("UCG", 0, ProductType.M, 974.123), + ("UCG", 1, ProductType.aBaseLoss, 114.031), ("UCG", 2, ProductType.aBaseLoss, 420.056), + ("UCG", 1, ProductType.c, 308.040), ("UCG", 2, ProductType.c, 613.082), + ("UCG", 1, ProductType.dWaterLoss, 306.025), ("UCG", 2, ProductType.dWaterLoss, 611.066), + ("UCG", 1, ProductType.w, 443.023), ("UCG", 2, ProductType.w, 748.064), + ("UCG", 1, ProductType.y, 363.057), ("UCG", 2, ProductType.y, 668.098), + ("UCG", 1, ProductType.yWaterLoss, 345.047), ("UCG", 2, ProductType.yWaterLoss, 650.089), + + ("UUG", 0, ProductType.M, 975.107), + ("UUG", 1, ProductType.aBaseLoss, 114.031), ("UUG", 2, ProductType.aBaseLoss, 420.056), + ("UUG", 1, ProductType.c, 308.041), ("UUG", 2, ProductType.c, 614.066), + ("UUG", 1, ProductType.dWaterLoss, 306.025), ("UUG", 2, ProductType.dWaterLoss, 612.050), + ("UUG", 1, ProductType.w, 443.023), ("UUG", 2, ProductType.w, 749.048), + ("UUG", 1, ProductType.y, 363.057), ("UUG", 2, ProductType.y, 669.082), + ("UUG", 1, ProductType.yWaterLoss, 345.047), ("UUG", 2, ProductType.yWaterLoss, 651.073), + + ("AUAG", 0, ProductType.M, 1247.220), + ("AUAG", 1, ProductType.aBaseLoss, 114.031), ("AUAG", 2, ProductType.aBaseLoss, 443.083), ("AUAG", 3, ProductType.aBaseLoss, 749.108), + ("AUAG", 1, ProductType.c, 331.068), ("AUAG", 2, ProductType.c, 637.093), ("AUAG", 3, ProductType.c, 966.146), + ("AUAG", 1, ProductType.dWaterLoss, 329.052), ("AUAG", 2, ProductType.dWaterLoss, 635.077), ("AUAG", 3, ProductType.dWaterLoss, 964.129), + ("AUAG", 1, ProductType.w, 363.057), ("AUAG", 2, ProductType.w, 692.109), ("AUAG", 3, ProductType.w, 998.134), + ("AUAG", 1, ProductType.y, 283.091), ("AUAG", 2, ProductType.y, 612.143), ("AUAG", 3, ProductType.y, 918.168), + ("AUAG", 1, ProductType.yWaterLoss, 265.081), ("AUAG", 2, ProductType.yWaterLoss, 594.134), ("AUAG", 3, ProductType.yWaterLoss, 
900.159), + }; + + [Test] // test values calculated with http://rna.rega.kuleuven.be/masspec/mongo.htm + [TestCase("UAGUCGUUGAUAG", 4140.555, new[] { "UAG", "UCG", "UUG", "AUAG" }, + new[] { 998.134, 974.123, 975.107, 1247.220 })] + public static void TestDigestionAndFragmentation(string sequence, double monoMass, + string[] digestionProductSequences, double[] digestionProductMasses) + { + RNA rna = new(sequence); + Assert.That(rna.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + + // digest RNA + var digestionParams = new RnaDigestionParams("RNase T1"); + var products = rna.Digest(digestionParams, new List(), new List()) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(products.Count, Is.EqualTo(digestionProductSequences.Length)); + + // ensure digestion sequence and masses are correct + for (var index = 0; index < products.Count; index++) + { + var digestionProduct = products[index]; + Assert.That(digestionProduct.BaseSequence, Is.EqualTo(digestionProductSequences[index])); + Assert.That(digestionProduct.MonoisotopicMass, Is.EqualTo(digestionProductMasses[index]).Within(0.01)); + + List fragments = new(); + digestionProduct.Fragment(DissociationType.CID, FragmentationTerminus.Both, fragments); + + // test that fragments are correct + var fragmentsToCompare = DigestFragmentTestCases + .Where(p => p.Sequence.Equals(digestionProduct.BaseSequence)).ToList(); + for (var i = 0; i < fragments.Count; i++) + { + var fragment = fragments[i]; + var theoreticalFragment = fragmentsToCompare.FirstOrDefault(p => + p.FragmentNumber == fragment.FragmentNumber && p.Type == fragment.ProductType); + if (theoreticalFragment.Mass is 0.0 ) continue; + Assert.That(fragment.MonoisotopicMass, Is.EqualTo(theoreticalFragment.Mass).Within(0.01)); + Assert.That(fragment.FragmentNumber, Is.EqualTo(theoreticalFragment.FragmentNumber)); + Assert.That(fragment.ProductType, Is.EqualTo(theoreticalFragment.Type)); + Assert.That(fragment.FragmentNumber, 
Is.EqualTo(theoreticalFragment.FragmentNumber)); + if (fragment.Terminus == FragmentationTerminus.FivePrime) + Assert.That(fragment.AminoAcidPosition, Is.EqualTo(theoreticalFragment.FragmentNumber)); + else if (fragment.Terminus == FragmentationTerminus.None) + Assert.That(fragment.FragmentNumber, Is.EqualTo(0)); + else + Assert.That(fragment.AminoAcidPosition, Is.EqualTo(digestionProductSequences[index].Length - theoreticalFragment.FragmentNumber)); + } + } + } + + [Test] + [TestCase("UAGUCGUUGAUAG", new[] { "UAG", "UCG", "UUG", "AUAG" }, + new[] { 1, 4, 7, 10 }, new[] { 3, 6, 9, 13 }, new[] { '-', 'G', 'G', 'G' }, + new[] { 'U', 'U', 'A', '-' })] + public static void TestOligoWithSetMods_AAPositions(string sequence, string[] digestionProductSequences, + int[] startResidue, int[] endResidue, char[] preciousResidue, char[] nextResidue) + { + RNA rna = new RNA(sequence); + var digestionProducts = rna.Digest(new RnaDigestionParams("RNase T1"), new List(), + new List()).Select(p => (OligoWithSetMods)p).ToList(); + + Assert.That(digestionProducts.All(p => p.DigestionParams.DigestionAgent.Name == "RNase T1")); + for (var index = 0; index < digestionProducts.Count; index++) + { + var digestionProduct = digestionProducts[index]; + Assert.That(digestionProduct.BaseSequence, Is.EqualTo(digestionProductSequences[index])); + Assert.That(digestionProduct.OneBasedStartResidue, Is.EqualTo(startResidue[index])); + Assert.That(digestionProduct.OneBasedEndResidue, Is.EqualTo(endResidue[index])); + Assert.That(digestionProduct.PreviousResidue, Is.EqualTo(preciousResidue[index])); + Assert.That(digestionProduct.NextResidue, Is.EqualTo(nextResidue[index])); + } + } + + [Test] + public static void TestTermini_ThreePrimeCyclicPhosphate() + { + string sequence = "UAGUCGUUGAUAG"; + RNA rna = new RNA(sequence); + var oligoCyclicPhosphate = PtmListLoader.ReadModsFromString( + "ID Cyclic Phosphate\r\nTG X\r\nPP Oligo 3'-terminal.\r\nMT Digestion Termini\r\nCF H-2 O-1\r\nDR Unimod; 
280.\r\n//", + out List<(Modification, string)> errors).First(); + var nucleicAcidCyclicPhosphate = PtmListLoader.ReadModsFromString( + "ID Cyclic Phosphate\r\nTG X\r\nPP 3'-terminal.\r\nMT Digestion Termini\r\nCF H-2 O-1\r\nDR Unimod; 280.\r\n//", + out errors).First(); + Assert.That(!errors.Any()); + + // top-down digestion, 3' terminal modification + var variableMods = new List { nucleicAcidCyclicPhosphate }; + var digestionParams = new RnaDigestionParams("top-down"); + var digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[0].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[0].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG")); + + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + Assert.That(digestionProducts[1].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG[H-2O-1]")); + Assert.That(digestionProducts[1].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG[-18.010565]")); + + // top-down digestion, 3' oligo terminal modification + variableMods = new List { oligoCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(1)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + + // RNase T1 digestion, 3' terminal modification + digestionParams = new RnaDigestionParams("RNase T1"); + variableMods = new List { nucleicAcidCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(5)); + var expected = new List() + { + 
"UAG", "UCG", "UUG", "AUAG", "AUAG[Digestion Termini:Cyclic Phosphate on X]" + }; + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + + // RNase T1 digestion, 3' oligo terminal modification + variableMods = new List { oligoCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(7)); + expected = new List() + { + "UAG", "UAG[Digestion Termini:Cyclic Phosphate on X]", + "UCG", "UCG[Digestion Termini:Cyclic Phosphate on X]", + "UUG", "UUG[Digestion Termini:Cyclic Phosphate on X]", + "AUAG", + }; + + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + } + + [Test] + public static void TestTermini_FivePrimeLargeMod() + { + string sequence = "UAGUCGUUGAUAG"; + RNA rna = new RNA(sequence); + var oligoLargeMod = PtmListLoader.ReadModsFromString( + "ID Pfizer 5'-Cap\r\nTG X\r\nPP Oligo 5'-terminal.\r\nMT Standard\r\nCF C13H22N5O14P3\r\nDR Unimod; 280.\r\n//", + out List<(Modification, string)> errors).First(); + var nucleicAcidLargeMod = PtmListLoader.ReadModsFromString( + "ID Pfizer 5'-Cap\r\nTG X\r\nPP 5'-terminal.\r\nMT Standard\r\nCF C13H22N5O14P3\r\nDR Unimod; 280.\r\n//", + out errors).First(); + Assert.That(!errors.Any()); + + // top-down digestion, 5' terminal modification + var variableMods = new List { nucleicAcidLargeMod }; + var digestionParams = new RnaDigestionParams("top-down"); + var digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("[Standard:Pfizer 5'-Cap on X]UAGUCGUUGAUAG")); + + // top-down digestion, 5' oligo 
terminal modification + variableMods = new List { oligoLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(1)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + + // RNase T1 digestion, 5' terminal modification + digestionParams = new RnaDigestionParams("RNase T1"); + variableMods = new List { nucleicAcidLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(5)); + var expected = new List() + { + "UAG", "[Standard:Pfizer 5'-Cap on X]UAG", "UCG", "UUG", "AUAG" + }; + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + + // RNase T1 digestion, 5' oligo terminal modification + variableMods = new List { oligoLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(7)); + expected = new List() + { + "UAG", + "UCG", "[Standard:Pfizer 5'-Cap on X]UCG", + "UUG", "[Standard:Pfizer 5'-Cap on X]UUG", + "AUAG", "[Standard:Pfizer 5'-Cap on X]AUAG" + }; + + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + } + + [Test] + [TestCase("UAGUCGUUGAUAG")] + public static void TestOligoWithSetMods_PropertiesWithTopDownDigestion(string sequence) + { + var rna = new RNA(sequence); + var oligoWithSetMods = + rna.Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + + Assert.That(rna.BaseSequence, Is.EqualTo(oligoWithSetMods.BaseSequence)); + Assert.That(rna.ThreePrimeTerminus, Is.EqualTo(oligoWithSetMods.ThreePrimeTerminus)); + Assert.That(rna.FivePrimeTerminus, Is.EqualTo(oligoWithSetMods.FivePrimeTerminus)); + Assert.That(rna.ThisChemicalFormula, Is.EqualTo(oligoWithSetMods.ThisChemicalFormula)); + Assert.That(rna.Length, Is.EqualTo(oligoWithSetMods.Length)); + } + + [Test] + public static void OligoWithSetMods_CalculatedValues() + { + var rna = new RNA("GUACUG"); + var rnaFormula = rna.ThisChemicalFormula; + + string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//"; + var sodiumAdduct = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods).First(); + var oligoWithSetMods = + rna.Digest(new RnaDigestionParams(), new List() { sodiumAdduct }, new List()) + .First() as OligoWithSetMods ?? throw new NullReferenceException(); + + Assert.That(oligoWithSetMods.NumMods, Is.EqualTo(1)); + Assert.That(oligoWithSetMods.NumFixedMods, Is.EqualTo(1)); + Assert.That(oligoWithSetMods.NumVariableMods, Is.EqualTo(0)); + Assert.That(oligoWithSetMods.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + + var formula = oligoWithSetMods.ThisChemicalFormula; + Assert.That(formula, Is.EqualTo(rnaFormula + sodiumAdduct.ChemicalFormula)); + + var formulaToAdd = ChemicalFormula.ParseFormula("H"); + var deltaMass = formulaToAdd.MonoisotopicMass; + var oldMonoMass = oligoWithSetMods.MonoisotopicMass; + var oldMostAbundantMass = oligoWithSetMods.MostAbundantMonoisotopicMass; + + oligoWithSetMods.FivePrimeTerminus = formulaToAdd + oligoWithSetMods.FivePrimeTerminus; + + Assert.That(oligoWithSetMods.MonoisotopicMass, Is.EqualTo(oldMonoMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.MostAbundantMonoisotopicMass, Is.EqualTo(oldMostAbundantMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.ThisChemicalFormula, 
Is.EqualTo(formula + formulaToAdd)); + + oldMonoMass = oligoWithSetMods.MonoisotopicMass; + oldMostAbundantMass = oligoWithSetMods.MostAbundantMonoisotopicMass; + oligoWithSetMods.ThreePrimeTerminus = formulaToAdd + oligoWithSetMods.ThreePrimeTerminus; + + Assert.That(oligoWithSetMods.MonoisotopicMass, Is.EqualTo(oldMonoMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.MostAbundantMonoisotopicMass, Is.EqualTo(oldMostAbundantMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.ThisChemicalFormula, Is.EqualTo(formula + formulaToAdd + formulaToAdd)); + + Assert.Throws(() => + { + var oligo = new OligoWithSetMods("GUA|GAUGUC", new Dictionary()); + }); + } + + #endregion + + #region DigestionParams + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestDigestionParams_Properties(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(RnaseDictionary.Dictionary[testCase.Enzyme])); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(testCase.MissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(testCase.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(testCase.MaxLength)); + + digestionParams.MaxModificationIsoforms = 2048; + digestionParams.MaxMods = 3; + Assert.That(digestionParams.MaxModificationIsoforms, Is.EqualTo(2048)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(3)); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()); + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + public void TestDigestionParamsClone() + { + var digestionParams = new RnaDigestionParams("top-down", 0, 3, 20000); + var cloned = digestionParams.Clone(FragmentationTerminus.C); + + // set new terminus, all values except 
terminus are retained + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(cloned.DigestionAgent)); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(cloned.MaxMissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(cloned.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); + Assert.That(digestionParams.FragmentationTerminus, Is.Not.EqualTo(cloned.FragmentationTerminus)); + Assert.That(digestionParams.SearchModeType, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.C)); + + // do not set new terminus, all values are retained + cloned = digestionParams.Clone(); + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(cloned.DigestionAgent)); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(cloned.MaxMissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(cloned.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); + Assert.That(digestionParams.FragmentationTerminus, Is.EqualTo(cloned.FragmentationTerminus)); + Assert.That(digestionParams.SearchModeType, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.Both)); + } + + #endregion + + #region NucleicAcid + + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_Counts(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()); + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + 
[TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_Sequences(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()) + .ToList(); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var product = digestionProducts[i]; + var testCaseCaseSequence = testCase.Sequences[i]; + Assert.That(product.BaseSequence, Is.EqualTo(testCaseCaseSequence)); + Assert.That(product.FullSequence, Is.EqualTo(testCaseCaseSequence)); + } + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_MonoMasses(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()) + .ToList(); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var productMass = digestionProducts[i].MonoisotopicMass; + var testCaseCaseMass = testCase.MonoMasses[i]; + Assert.That(productMass, Is.EqualTo(testCaseCaseMass).Within(0.01)); + } + } + + [Test] + public static void TestNucleicAcid_Digestion_Exception() + { + IDigestionParams digestionParams = new Proteomics.ProteolyticDigestion.DigestionParams(); + var rna = new RNA("GUACUGGUACUG"); + + try + { + var result = rna.Digest(digestionParams, new List(), new List()); + } + catch (Exception e) + { + Assert.That(e, Is.TypeOf()); + Assert.That(e.InnerException, Is.TypeOf()); + } + } + + #endregion + + #region Digestion with Modifications + + public static 
List SodiumAdducts => + PtmListLoader.ReadModsFromString("ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List PotassiumAdducts => + PtmListLoader.ReadModsFromString("ID Potassium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List TerminalSodiumAdducts => + PtmListLoader.ReadModsFromString("ID Sodium\r\nMT Metal\r\nPP 3'-terminal.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List TerminalPotassiumAdducts => + PtmListLoader.ReadModsFromString("ID Potassium\r\nMT Metal\r\nPP 5'-terminal.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + [Test] + public static void TestVariableModsCountCorrect() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams() + { + MaxMods = 1, + }; + + var precursors = rna.Digest(rnaDigestionParams, new List(), SodiumAdducts) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(7)); + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + + rnaDigestionParams.MaxMods = 2; + precursors = rna.Digest(rnaDigestionParams, new List(), SodiumAdducts) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(22)); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + 
Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]U[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]A[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]AC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]C[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]U[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + } + + [Test] + public static void TestFixedModsCountCorrect() + { + var sodiumAdduct = new List() { SodiumAdducts[0] }; + + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams() + { + MaxMods = 1, + }; + var 
precursors = rna.Digest(rnaDigestionParams, sodiumAdduct, new List()) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.First().NumFixedMods, Is.EqualTo(1)); + Assert.That(precursors.First().FullSequence, Is.EqualTo("GUA[Metal:Sodium on A]CUG")); + Assert.That(precursors.First().MonoisotopicMass, Is.EqualTo(1896.26).Within(0.01)); + + sodiumAdduct = new List() { SodiumAdducts[2] }; + + precursors = rna.Digest(rnaDigestionParams, sodiumAdduct, new List()) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.First().NumFixedMods, Is.EqualTo(2)); + Assert.That(precursors.First().FullSequence, Is.EqualTo("G[Metal:Sodium on G]UACUG[Metal:Sodium on G]")); + Assert.That(precursors.First().MonoisotopicMass, Is.EqualTo(1918.25).Within(0.01)); + } + + [Test] + public static void TestFixedAndVariableMods() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + rnaDigestionParams.MaxMods = 1; + var fixedMods = new List { PotassiumAdducts[0] }; // A + var variableMods = new List { SodiumAdducts[1] }; // C + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.All(p => p.NumFixedMods == 1)); + Assert.That(fullSequences.Contains("GUA[Metal:Potassium on A]CUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Potassium on A]C[Metal:Sodium on C]UG")); + + var oneOfEach = precursors.First(p => p.FullSequence.Equals("GUA[Metal:Potassium on A]C[Metal:Sodium on C]UG")); + Assert.That(oneOfEach.NumFixedMods, Is.EqualTo(1)); + Assert.That(oneOfEach.NumVariableMods, Is.EqualTo(1)); + Assert.That(oneOfEach.NumMods, Is.EqualTo(2)); + + fixedMods = new List { PotassiumAdducts[2] }; // G + variableMods = new List { SodiumAdducts[1] }; // C + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + 
.ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + + fixedMods = new List { PotassiumAdducts[2] }; // G + variableMods = new List { SodiumAdducts[1], SodiumAdducts[3] }; // C, U + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(4)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]ACUG[Metal:Potassium on G]")); + + rnaDigestionParams.MaxMods = 2; + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(7)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]ACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium 
on U]ACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]U[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]AC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + } + + /// + /// Test when one fixed and one variable mod are used and share a localization + /// expect two results, one with the fixed, and one with the variable + /// + [Test] + public static void TestFixedAndVariableMods_LocalizationOverlap() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { PotassiumAdducts[1] }; // C + var variableMods = new List { SodiumAdducts[1] }; // C + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.Any(p => p.NumFixedMods == 1)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 0)); + Assert.That(precursors.All(p => p.NumMods == 1)); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when two variable mods are used and share a localization + /// expect three results, one unmodified, and two singly modified + /// + [Test] + public static void TestVariableMods_LocalizationOverlap() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // C + var variableMods = new List { PotassiumAdducts[1], SodiumAdducts[1] }; + var precursors = 
rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + // expect three results, one unmodified, and two singly modified + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(3)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when one modification is annotated in the database at a position (23) beyond the end of the six-residue sequence + /// expect a single result for every MaxMods value tested: the unmodified sequence with no fixed or variable mods + /// + [Test] + public static void TestDatabaseAnnotatedMods_OutOfBounds() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 23, new List() { PotassiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // no fixed mods + var variableMods = new List { }; // no variable mods + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + Assert.That(precursors.All(p => p.NumVariableMods == 0)); + Assert.That(precursors.All(p => p.NumMods == 0)); + Assert.That(fullSequences.Contains("GUACUG")); + } + } + + /// + /// Test when one modification is annotated in the database + /// expect two results, one unmodified, and one singly modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_SingleModification() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 4, new List() { 
PotassiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // C + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors[0].NumMods, Is.EqualTo(0)); + Assert.That(precursors[1].NumMods, Is.EqualTo(1)); + Assert.That(precursors[1].NumVariableMods, Is.EqualTo(1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + } + } + + /// + /// Test when two modifications are annotated in the database at the same location + /// expect three results, one unmodified, and two singly modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_LocalizationOverlap() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 4, new List() { PotassiumAdducts[1], SodiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(3)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when two terminal modifications 
are annotated in the database + /// MaxMods 1: expect three results, one unmodified, and two singly modified + /// MaxMods 2: expect four results, one unmodified, and two singly modified, and one double modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2]} }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + // expect three results, one unmodified, and two singly modified + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2 + i)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + + if (rnaDigestionParams.MaxMods != 2) continue; + Assert.That(precursors.Any(p => p.NumVariableMods == 2)); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + } + } + + /// + /// Test when two terminal modifications are annotated in the database and one database mod on first residue + /// MaxMods 1: expect four results, one unmodified, and three singly modified + /// MaxMods 2: expect seven results, one unmodified, and three singly modified, and three double modified + /// MaxMods 3: expect eight results, one unmodified, and three singly modified, and three 
double modified, and one triply modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueDatabaseMod() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2], PotassiumAdducts[2] } }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database and one database mod on first residue + for (int i = 1; i < 4; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + + switch (rnaDigestionParams.MaxMods) + { + case 1: + Assert.That(precursors.Count(), Is.EqualTo(4)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods == 1)); + break; + case 2: + Assert.That(precursors.Count(), Is.EqualTo(7)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + + case 3: + Assert.That(precursors.Count(), Is.EqualTo(8)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + } + + // the unmodified and singly modified forms are expected at every MaxMods value tested + if (rnaDigestionParams.MaxMods >= 1) + { + + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + } + + // independent check (an else-if here was unreachable because MaxMods >= 1 always holds): doubly modified forms + if (rnaDigestionParams.MaxMods >= 2) + { + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on 
G]G[Metal:Potassium on G]UACUG")); + } + + // independent check: the triply modified form only appears once three mods are allowed + if (rnaDigestionParams.MaxMods >= 3) + { + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + } + } + } + + /// + /// Test when two terminal modifications are annotated in the database and one variable mod (potassium on G) is supplied + /// MaxMods 1: expect five results, one unmodified, and four singly modified + /// MaxMods 2: expect eleven results, one unmodified, and four singly modified, and six double modified + /// MaxMods 3: expect fifteen results, one unmodified, and four singly modified, and six double modified, and four triply modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueVariableMod() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2] } }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database and one variable mod is passed to Digest + for (int i = 1; i < 4; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { PotassiumAdducts[2] }; + + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + + switch (rnaDigestionParams.MaxMods) + { + case 1: + Assert.That(precursors.Count(), Is.EqualTo(5)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods == 1)); + break; + case 2: + Assert.That(precursors.Count(), Is.EqualTo(11)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + + case 3: + Assert.That(precursors.Count(), Is.EqualTo(15)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); 
+ break; + } + + // the unmodified and singly modified forms are expected at every MaxMods value tested + if (rnaDigestionParams.MaxMods >= 1) + { + + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + } + + // independent check (an else-if here was unreachable because MaxMods >= 1 always holds): the six doubly modified forms + if (rnaDigestionParams.MaxMods >= 2) + { + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]")); + } + + // independent check: the four triply modified forms only appear once three mods are allowed + if (rnaDigestionParams.MaxMods >= 3) + { + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + } + } + } + + [Test] + public static void TestDigestionMaxIsoforms() + { + var rna = new RNA("GUACUAGACUACAUGGUACAUCA"); + var rnaDigestionParams = new RnaDigestionParams(); + var variableMods = SodiumAdducts.Concat(PotassiumAdducts) + .Concat(TerminalPotassiumAdducts).Concat(TerminalSodiumAdducts).ToList(); + + var digestionProducts = rna.Digest(rnaDigestionParams, new List(), variableMods) + .ToList(); + Assert.That(digestionProducts.Count, 
Is.EqualTo(rnaDigestionParams.MaxModificationIsoforms)); + } + + #endregion + } +} diff --git a/mzLib/Test/Transcriptomics/TestFragmentation.cs b/mzLib/Test/Transcriptomics/TestFragmentation.cs new file mode 100644 index 000000000..76ddb8c3b --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestFragmentation.cs @@ -0,0 +1,244 @@ +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Transcriptomics; +using MassSpectrometry; +using Omics; +using Omics.Fragmentation; +using Omics.Fragmentation.Oligo; +using Omics.Modifications; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestFragmentation + { + + public static IEnumerable GetSixMerIndividualFragmentTypeTestCases() => + TestNucleicAcid.GetSixmerIndividualFragmentTypeTestCases(); + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestGetNeutralFragments(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + + var neutralFragments = rna.GetNeutralFragments(testCase.Type).ToList(); + // every expected mass needs a matching fragment, otherwise the loop below could pass without checking anything + Assert.That(neutralFragments.Count, Is.EqualTo(testCase.NeutralMasses.Length)); + // start at index 0 so the first fragment is verified as well (the loop previously began at 1 and skipped it) + for (int i = 0; i < neutralFragments.Count; i++) + { + Assert.That(neutralFragments[i].NeutralMass, Is.EqualTo(testCase.NeutralMasses[i]).Within(0.01)); + } + } + + + private static IEnumerable ImplementedDissociationTypes + { + get + { + Loaders.LoadElements(); + foreach (var type in DissociationTypeCollection.AllImplementedDissociationTypes) + yield return type; + } + } + + /// + /// This test makes the assumption that the M ion is a component of all product types + /// + /// + [Test] + [TestCaseSource(nameof(ImplementedDissociationTypes))] + public void TestFragmentation_Unmodified_ProductCountsAreCorrect(DissociationType type) + { + Loaders.LoadElements(); + List products = new(); + var rnaToTest = new List + { + new RNA("GUACUG"), + new RNA("GUACUGCACUGU"), + new RNA("GUACUGUAAUGAGACUAGUACAUGACAUG"), + }; + var terminiToTest = new List { FragmentationTerminus.Both, FragmentationTerminus.FivePrime, FragmentationTerminus.ThreePrime }; + var potentialProducts = type.GetRnaProductTypesFromDissociationType(); + + // test with top down digestion and no modifications + var digestionparams = new RnaDigestionParams(rnase: "top-down"); + var fixedMods = new List(); + var variableMods = new List(); + foreach (var term in terminiToTest) + { + foreach (var oligoWithSetMods in rnaToTest.Select(rna => rna.Digest(digestionparams, fixedMods, variableMods).First())) + { + var terminalSpecifc = term == FragmentationTerminus.Both + ? potentialProducts + : potentialProducts.Where(p => p.GetRnaTerminusType() == term).ToList(); + + var expectedProductCount = term == FragmentationTerminus.Both + ? 
(oligoWithSetMods.Length - 1) * (terminalSpecifc.Count - 1) + 1 // there is only one M ion, so for both termini, remove it from the multiplier and add one + : (oligoWithSetMods.Length - 1) * terminalSpecifc.Count; + + oligoWithSetMods.Fragment(type, term, products); + Assert.That(products.Count, Is.EqualTo(expectedProductCount)); + Assert.That(products.All(p => terminalSpecifc.Contains(p.ProductType))); + } + } + } + + [Test] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.a, + new[] { 267.089, 573.114, 902.167, 1207.208, 1513.233 }, + new[] { 267.089, 573.114, 902.167 + 21.982, 1207.208 + 21.982, 1513.233 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.b, + new[] { 283.084, 589.109, 918.162, 1223.203, 1529.228 }, + new[] { 283.084, 589.109, 918.162 + 21.982, 1223.203 + 21.982, 1529.228 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.c, + new[] { 347.055, 653.081, 982.133, 1287.174, 1593.2 }, + new[] { 347.055, 653.081, 982.133 + 21.982, 1287.174 + 21.982, 1593.2 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.d, + new[] { 363.05, 669.075, 998.128, 1303.169, 1609.195 }, + new[] { 363.05, 669.075, 998.128 + 21.982, 1303.169 + 21.982, 1609.195 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.dWaterLoss, + new[] { 345.039, 651.064, 980.116, 1285.157, 1591.184 }, + new[] { 345.039, 651.064, 980.116 + 21.982, 1285.157 + 21.982, 1591.184 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT 
Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.w, + new[] { 363.049, 669.074, 974.115, 1303.169, 1609.195 }, + new[] { 363.049, 669.074, 974.115, 1303.169 + 21.982, 1609.195 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.x, + new[] { 347.055, 653.081, 958.122, 1287.174, 1593.2 }, + new[] { 347.055, 653.081, 958.122, 1287.174 + 21.982, 1593.2 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.y, + new[] { 283.084, 589.109, 894.15, 1223.203, 1529.228 }, + new[] { 283.084, 589.109, 894.15, 1223.203 + 21.982, 1529.228 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.z, + new[] { 267.089, 573.124, 878.156, 1207.208, 1513.233 }, + new[] { 267.089, 573.124, 878.156, 1207.208 + 21.982, 1513.233 + 21.982 })] + public void TestFragmentation_Modified(string sequence, string modString, string fullSequence, double unmodifiedMass, double modifiedMass, + ProductType productType, double[] unmodifiedFragmentMass, double[] modifiedFragmentMasses) + { + var mods = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).ToList(); + var modDict = mods.ToDictionary(p => p.IdWithMotif, p => p); + var rna = new RNA(sequence); + + var unmodifiedOligo = new OligoWithSetMods(sequence, new Dictionary(), + 0, new RnaDigestionParams(), rna, 1, rna.Length); + Assert.That(unmodifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(0)); + Assert.That(unmodifiedOligo.FullSequence, Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.SequenceWithChemicalFormulas, Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.FullSequenceWithMassShift(), 
Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.MonoisotopicMass, Is.EqualTo(unmodifiedMass).Within(0.01)); + + var modifiedOligo = new OligoWithSetMods(fullSequence, modDict, + 0, new RnaDigestionParams(), rna, 1, rna.Length); + var formulaSequence = fullSequence.Replace("Metal:Sodium on A", "H-1Na"); + var massShiftSequence = fullSequence.Replace("Metal:Sodium on A", "+21.981944"); + Assert.That(modifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(mods.Count)); + Assert.That(modifiedOligo.FullSequence, Is.EqualTo(fullSequence)); + Assert.That(modifiedOligo.SequenceWithChemicalFormulas, Is.EqualTo(formulaSequence)); + Assert.That(modifiedOligo.FullSequenceWithMassShift(), Is.EqualTo(massShiftSequence)); + Assert.That(modifiedOligo.MonoisotopicMass, Is.EqualTo(modifiedMass).Within(0.01)); + + var unmodifiedProducts = unmodifiedOligo.GetNeutralFragments(productType).ToList(); + Assert.That(unmodifiedProducts.Count, Is.EqualTo(5)); + var modifiedProducts = modifiedOligo.GetNeutralFragments(productType).ToList(); + Assert.That(modifiedProducts.Count, Is.EqualTo(5)); + + + for (int i = 0; i < unmodifiedProducts.Count; i++) + { + var unModifedProduct = unmodifiedProducts[i]; + var modifiedProduct = modifiedProducts[i]; + + Assert.That(unModifedProduct.NeutralMass, Is.EqualTo(unmodifiedFragmentMass[i]).Within(0.01)); + Assert.That(modifiedProduct.NeutralMass, Is.EqualTo(modifiedFragmentMasses[i]).Within(0.01)); + } + } + + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestRnaFragments(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + List products = rna.GetNeutralFragments(testCase.Type).Select(p => (Product)p).ToList(); + + for (int i = 0; i < products.Count; i++) + { + var product = products[i]; + Assert.That(testCase.Type, Is.EqualTo(product.ProductType)); + Assert.That(testCase.Type.GetRnaTerminusType(), Is.EqualTo(product.Terminus)); + Assert.That(testCase.NeutralMasses[i], Is.EqualTo(product.NeutralMass).Within(0.01)); + Assert.That(testCase.NeutralMasses[i], Is.EqualTo(product.MonoisotopicMass).Within(0.01)); + Assert.That(0, Is.EqualTo(product.NeutralLoss)); + Assert.That(null, Is.EqualTo(product.SecondaryProductType)); + Assert.That(0, Is.EqualTo(product.SecondaryFragmentNumber)); + + string annotation = $"{product.ProductType}{product.FragmentNumber}"; + Assert.That(annotation, Is.EqualTo(product.Annotation)); + string toString = + $"{product.ProductType}{product.FragmentNumber};{product.NeutralMass:F5}-{product.NeutralLoss:0.##}"; + Assert.That(toString, Is.EqualTo(product.ToString())); + } + } + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestRnaFragmentNumbers(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? throw new NullReferenceException(); + List products = rna.GetNeutralFragments(testCase.Type).Select(p => (Product)p).ToList(); + + for (int i = 0; i < products.Count; i++) + { + var product = products[i]; + bool isThreePrime = product.ProductType.GetRnaTerminusType() == FragmentationTerminus.ThreePrime; + + int fragmentNumber = i + 1; + int residuePosition = isThreePrime ? 
rna.Length - fragmentNumber : fragmentNumber; + + Assert.That(product.FragmentNumber, Is.EqualTo(fragmentNumber)); + Assert.That(product.ResiduePosition, Is.EqualTo(residuePosition)); + } + + } + + [Test] + public void TestConstructorAndEquality() + { + Product product1 = new Product(ProductType.d, FragmentationTerminus.FivePrime, 200, 4, 4, 0.0); + Product product2 = new Product(ProductType.d, FragmentationTerminus.FivePrime, 200, 4, 4, 0.0); + Product uniqueProduct = new Product(ProductType.a, FragmentationTerminus.FivePrime, 201, 4, 4, 0.0); + + Assert.That(product1.Equals(product1)); + Assert.That(product1.Equals(product2)); + Assert.That(product1.GetHashCode(), Is.EqualTo(product2.GetHashCode())); + Assert.That(!product1.Equals(uniqueProduct)); + Assert.That(!product1.Equals(null)); + Assert.That(product1.GetHashCode(), Is.Not.EqualTo(uniqueProduct.GetHashCode())); + + Assert.That(product1.Equals((object)product1)); + Assert.That(product1.Equals((object)product2)); + Assert.That(!product1.Equals((object)uniqueProduct)); + Assert.That(!product1.Equals((object)new Product(ProductType.d, FragmentationTerminus.N, 200, 4, 4, 0.0))); + Assert.That(!product1.Equals((object)null)); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestNucleicAcid.cs b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs new file mode 100644 index 000000000..47e98d708 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs @@ -0,0 +1,174 @@ +using NUnit.Framework.Legacy; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Chemistry; +using Omics.Fragmentation; +using Transcriptomics; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + /// + /// Test Data generated with http://rna.rega.kuleuven.be/masspec/mongo.htm + /// + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestNucleicAcid + { + public record SixmerTestCase(string Sequence, ProductType Type, double[] 
NeutralMasses, string[] ChemicalFormulas); + + public static IEnumerable GetSixmerIndividualFragmentTypeTestCases() + { + Loaders.LoadElements(); + + yield return new SixmerTestCase("GUACUG", ProductType.a, + new[] { 267.089, 573.114, 902.167, 1207.208, 1513.233 }, + new[] { "C10H13N5O4", "C19H24N7O12P", "C29H36N12O18P2", "C38H48N15O25P3", "C47H59N17O33P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.b, + new[] { 283.084, 589.109, 918.162, 1223.203, 1529.228 }, + new[] { "C10H13N5O5", "C19H24N7O13P", "C29H36N12O19P2", "C38H48N15O26P3", "C47H59N17O34P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.c, + new[] { 347.055, 653.081, 982.133, 1287.174, 1593.2 }, + new[] { "C10H14N5O7P", "C19H25N7O15P2", "C29H37N12O21P3", "C38H49N15O28P4", "C47H60N17O36P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.d, + new[] { 363.05, 669.075, 998.128, 1303.169, 1609.195 }, + new[] { "C10H14N5O8P", "C19H25N7O16P2", "C29H37N12O22P3", "C38H49N15O29P4", "C47H60N17O37P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.dWaterLoss, + new[] { 345.039, 651.064, 980.116, 1285.157, 1591.184 }, + new[] { "C10H12N5O7P", "C19H23N7O15P2", "C29H35N12O21P3", "C38H47N15O28P4", "C47H58N17O36P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.w, + new[] { 363.049, 669.074, 974.115, 1303.169, 1609.195 }, + new[] { "C10H14N5O8P", "C19H25N7O16P2", "C28H37N10O23P3", "C38H49N15O29P4", "C47H60N17O37P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.x, + new[] { 347.055, 653.081, 958.122, 1287.174, 1593.2 }, + new[] { "C10H14N5O7P", "C19H25N7O15P2", "C28H37N10O22P3", "C38H49N15O28P4", "C47H60N17O36P5" }); + yield return new SixmerTestCase("GUACUG", ProductType.y, + new[] { 283.084, 589.109, 894.15, 1223.203, 1529.228 }, + new[] { "C10H13N5O5", "C19H24N7O13P", "C28H36N10O20P2", "C38H48N15O26P3", "C47H59N17O34P4", }); + yield return new SixmerTestCase("GUACUG", ProductType.z, + new[] { 267.089, 573.124, 878.156, 1207.208, 
1513.233 }, + new[] { "C10H13N5O4", "C19H24N7O12P", "C28H36N10O19P2", "C38H48N15O25P3", "C47H59N17O33P4", }); + + + yield return new SixmerTestCase("GUACUG", ProductType.aBaseLoss, + new[] { 114.03, 459.07, 765.095, 1094.147, 1399.198 }, + new[] { "C5H6O3", "C15H18N5O10P", "C24H29N7O18P2", "C34H41N12O24P3", "C43H53N15O31P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.bBaseLoss, + new[] { 130.027, 475.074, 781.099, 1110.152, 1415.193 }, + new[] { "C5H6O4", "C15H18N5O11P", "C24H29N7O19P2", "C34H41N12O25P3", "C43H53N15O32P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.cBaseLoss, + new[] { 193.998, 539.045, 845.071, 1174.123, 1479.164 }, + new[] { "C5H7O6P", "C15H19N5O13P2", "C24H30N7O21P3", "C34H42N12O27P4", "C43H54N15O34P5" }); + yield return new SixmerTestCase("GUACUG", ProductType.dBaseLoss, + new[] { 209.993, 555.04, 861.066, 1190.118, 1495.16 }, + new[] { "C5H7O7P", "C15H19N5O14P2", "C24H30N7O22P3", "C34H42N12O28P4", "C43H54N15O35P5" }); + + // TODO: Add water loss besides d-H2O + } + + + [Test] + [TestCase("GUACUG", 1874.281)] + [TestCase("A", 267.096)] + [TestCase("C", 243.085)] + [TestCase("U", 244.069)] + [TestCase("G", 283.091)] + [TestCase("GU", 589.116)] + [TestCase("AAA", 925.200)] + [TestCase("CCC", 853.166)] + [TestCase("UUU", 856.119)] + [TestCase("GGG", 973.185)] + public void TestConstructorsAndEquality(string sequence, double monoMass) + { + // test constructors and equality + RNA rna = new RNA(sequence); + + Assert.That(rna.Length, Is.EqualTo(sequence.Length)); + Assert.That(rna.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna.GetChemicalFormula().MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna.NucleicAcidArray.Length, Is.EqualTo(sequence.Length)); + CollectionAssert.AreEqual(rna.NucleicAcidArray.Select(p => p.Letter), sequence); + Assert.That(rna.FivePrimeTerminus.Equals(NucleicAcid.DefaultFivePrimeTerminus)); + 
Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + rna.ThreePrimeTerminus = rna.ThreePrimeTerminus; + Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + + List nucList = new(); + foreach (var nucleotide in sequence) + { + nucList.Add(Nucleotide.GetResidue(nucleotide)); + } + Assert.That(rna.NucleicAcidArray.SequenceEqual(nucList.ToArray())); + + var rna2 = new RNA(sequence, NucleicAcid.DefaultFivePrimeTerminus, NucleicAcid.DefaultThreePrimeTerminus); + + Assert.That(rna2.Length, Is.EqualTo(sequence.Length)); + Assert.That(rna2.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna.FivePrimeTerminus.Equals(NucleicAcid.DefaultFivePrimeTerminus)); + Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + nucList.Clear(); + foreach (var nucleotide in sequence) + { + nucList.Add(Nucleotide.GetResidue(nucleotide)); + } + Assert.That(rna.NucleicAcidArray.SequenceEqual(nucList.ToArray())); + + Assert.That(rna.Equals(rna2)); + Assert.That(rna.Equals(rna)); + Assert.That(!rna.Equals(null)); + Assert.That(rna.Equals((object)rna2)); + Assert.That(rna.Equals((object)rna)); + Assert.That(!rna.Equals((object)null)); + Assert.That(!rna.Equals((object)new Double())); + } + + [Test] + public void TestParseSequence() + { + var rna1 = new RNA("GUACUG"); + var rna2 = new RNA("GU ACU G"); + var rna3 = new RNA("GU*ACU*G"); + + Assert.That(rna1.BaseSequence, Is.EqualTo(rna2.BaseSequence)); + Assert.That(rna1.BaseSequence, Is.EqualTo(rna3.BaseSequence)); + Assert.That(rna1.GetHashCode(), Is.EqualTo(rna3.GetHashCode())); + Assert.That(rna1.GetHashCode(), Is.EqualTo(rna3.GetHashCode())); + Assert.That(rna1.Length, Is.EqualTo(rna3.Length)); + Assert.That(rna1.Length, Is.EqualTo(rna3.Length)); + + Assert.Throws(() => new RNA("GUA~CUG")); + } + + [Test] + [TestCase("GUACUG", new[] { -1, -2, -3, -4, -5 }, new[] { 1873.273, 936.133, 623.752, 467.562, 373.848 })] + public void 
TestElectroSpraySeries(string sequence, int[] charges, double[] mzs) + { + RNA rna = new(sequence); + + var esiSeries = rna.GetElectrospraySeries(charges.First(), charges.Last()).ToArray(); + for (int j = 0; j < mzs.Length; j++) + { + var ion = esiSeries[j]; + Assert.That(ion, Is.EqualTo(mzs[j]).Within(0.01)); + } + } + + [Test] + [TestCase("GUACUG", new[] { -1, -2, -3, -4, -5, -6 }, new[] { 1953.239, 976.116, 650.408, 487.554, 389.841, 324.700 })] + public void TestReplaceTerminusWithElectroSpraySeries(string sequence, int[] charges, double[] mzs) + { + RNA rna = new("GUACUG"); + rna.FivePrimeTerminus = ChemicalFormula.ParseFormula("H1"); + + var esiSeries = rna.GetElectrospraySeries(charges.Last(), charges.First()).ToArray(); + for (int j = 0; j < mzs.Length; j++) + { + var ion = esiSeries[j]; + Assert.That(ion, Is.EqualTo(mzs[j]).Within(0.01)); + } + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestNucleotide.cs b/mzLib/Test/Transcriptomics/TestNucleotide.cs index df250fd40..277ebc3d6 100644 --- a/mzLib/Test/Transcriptomics/TestNucleotide.cs +++ b/mzLib/Test/Transcriptomics/TestNucleotide.cs @@ -9,12 +9,12 @@ namespace Test.Transcriptomics { [ExcludeFromCodeCoverage] - internal class TestNucleotide + public class TestNucleotide { - internal record NucleotideTestCase(Nucleotide Nucleotide, string Name, char OneLetterCode, string Symbol, ChemicalFormula Formula, double Mass, + public record NucleotideTestCase(Nucleotide Nucleotide, string Name, char OneLetterCode, string Symbol, ChemicalFormula Formula, double Mass, ChemicalFormula nucleosideFormula); - internal static IEnumerable GetNucleotideTestCases() + public static IEnumerable GetNucleotideTestCases() { Loaders.LoadElements(); diff --git a/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs b/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs new file mode 100644 index 000000000..6255ef2fd --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs @@ -0,0 +1,79 @@ +using System; +using 
System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework; +using Omics.Modifications; +using Transcriptomics.Digestion; +using Transcriptomics; + +namespace Test.Transcriptomics +{ + [ExcludeFromCodeCoverage] + public static class TestOligoWithSetMods + { + [Test] + [TestCase( 0, 1, 20.45)] + [TestCase(1, 1, 20.45)] + [TestCase( 0, 2, 20.45)] + [TestCase(1, 2, 20.45)] + [TestCase( 0, 5, 28.37)] + [TestCase(1, 5, 28.37)] + [TestCase( 0, 6, 28.37)] + [TestCase(1, 6, 28.37)] + public static void TestLocalize(int modsOnOligo, int indexOfMass, double massToLocalize) + { + var oligoWithSetMods = new RNA("GUACUG", + oneBasedPossibleLocalizedModifications: new Dictionary> { { 4, [TestDigestion.PotassiumAdducts[1]] } }) + .Digest(new RnaDigestionParams(), [], []) + .ElementAt(modsOnOligo); + + Assert.That(oligoWithSetMods.AllModsOneIsNterminus.Count, Is.EqualTo(modsOnOligo)); + + // Act + var localizedOligo = oligoWithSetMods.Localize(indexOfMass - 2, massToLocalize); + + // Assert + int expectedModificationCount; + double expectedMass; + if (modsOnOligo == 1) // if the oligo started with a mod + { + int indexOfOriginalMod = oligoWithSetMods.AllModsOneIsNterminus.Keys.First(); + + // ensure original modification exist + Assert.That(localizedOligo.AllModsOneIsNterminus.ContainsKey(indexOfOriginalMod)); + + if (indexOfOriginalMod != indexOfMass) // Additional mass was added to a different location + { + expectedModificationCount = modsOnOligo + 1; + expectedMass = massToLocalize; + + // ensure original modification is still intact + Assert.That(oligoWithSetMods.OneBasedPossibleLocalizedModifications[indexOfOriginalMod][0].MonoisotopicMass, + Is.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfOriginalMod].MonoisotopicMass)); + } + else // Additional mass was added to the location of an existing modification + { + expectedModificationCount = modsOnOligo; + expectedMass = 
massToLocalize + TestDigestion.PotassiumAdducts[1].MonoisotopicMass!.Value; + + // ensure original modification has been altered + Assert.That(oligoWithSetMods.OneBasedPossibleLocalizedModifications[indexOfOriginalMod][0].MonoisotopicMass, + Is.Not.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfOriginalMod].MonoisotopicMass)); + } + } + else // oligo started with no modifications + { + expectedModificationCount = modsOnOligo + 1; + expectedMass = massToLocalize; + } + + + Assert.That(expectedModificationCount, Is.EqualTo(localizedOligo.AllModsOneIsNterminus.Count)); + Assert.That(localizedOligo.AllModsOneIsNterminus.ContainsKey(indexOfMass)); + Assert.That(expectedMass, Is.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfMass].MonoisotopicMass)); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestProductType.cs b/mzLib/Test/Transcriptomics/TestProductType.cs new file mode 100644 index 000000000..15757f4d2 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestProductType.cs @@ -0,0 +1,278 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Chemistry; +using MassSpectrometry; +using NUnit.Framework; +using NUnit.Framework.Legacy; +using Omics.Fragmentation; +using Omics.Fragmentation.Oligo; +using Omics.Modifications; +using Transcriptomics; +using Transcriptomics.Digestion; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestProductType + { + [Test] + [TestCase(DissociationType.HCD, new[] { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, + ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M })] + [TestCase(DissociationType.CID, new[] { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, + ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M })] + public void TestProductTypes_Dissociation(DissociationType dissociation, 
ProductType[] products) + { + CollectionAssert.AreEquivalent(products, dissociation.GetRnaProductTypesFromDissociationType()); + } + + [Test] + [TestCase(FragmentationTerminus.FivePrime, new[] + { + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + })] + [TestCase(FragmentationTerminus.ThreePrime, new[] + { + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + })] + public void TestProductTypes_Terminus(FragmentationTerminus terminus, ProductType[] products) + { + CollectionAssert.AreEquivalent(products, terminus.GetRnaTerminusSpecificProductTypes()); + } + + [Test] + [TestCase(DissociationType.HCD, FragmentationTerminus.FivePrime, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, })] + [TestCase(DissociationType.HCD, FragmentationTerminus.ThreePrime, new[] + { ProductType.w, ProductType.x, ProductType.y, ProductType.z, })] + [TestCase(DissociationType.HCD, FragmentationTerminus.Both, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M })] + [TestCase(DissociationType.CID, FragmentationTerminus.FivePrime, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss })] + [TestCase(DissociationType.CID, FragmentationTerminus.ThreePrime, new[] + { ProductType.w, ProductType.y, ProductType.yWaterLoss })] + [TestCase(DissociationType.CID, FragmentationTerminus.Both, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.c, 
ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M })] + public void TestProductTypes_TerminusAndDissociation(DissociationType dissociation, FragmentationTerminus terminus, ProductType[] products) + { + CollectionAssert.AreEquivalent(products, dissociation.GetRnaTerminusSpecificProductTypesFromDissociation(terminus)); + } + + [Test] + public static void Test_NeutralMassShiftFromProductType() + { + foreach (ProductType p in Enum.GetValues(typeof(ProductType))) + { + double mass = 0; + switch (p) + { + case ProductType.a: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.b: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("OH").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.c: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3H2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.x: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-1H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.y: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-3P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.zWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-5H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.aWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + 
Assert.That(ChemicalFormula.ParseFormula("H-1O-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.aBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.bBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O1H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.cWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.cBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3H-1P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.d: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O4H2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.dWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.dBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O4H-1P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + + case ProductType.w: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.wWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + 
Assert.That(ChemicalFormula.ParseFormula("H-1O-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.xWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-2H-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.yWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.z: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.wBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.xBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-1H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.yBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-3H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.zBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4H-3P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + } + } + } + + [Test] + public void TestProductTypes_GetRnaTerminusType() + { + foreach (var type in Enum.GetValues()) + { + switch (type) + { + case ProductType.a: + case ProductType.aWaterLoss: + case ProductType.aBaseLoss: + case ProductType.b: + case ProductType.bWaterLoss: + case ProductType.bBaseLoss: + case ProductType.c: 
+ case ProductType.cWaterLoss: + case ProductType.cBaseLoss: + case ProductType.d: + case ProductType.dWaterLoss: + case ProductType.dBaseLoss: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.FivePrime)); + break; + + case ProductType.w: + case ProductType.wWaterLoss: + case ProductType.wBaseLoss: + case ProductType.x: + case ProductType.xWaterLoss: + case ProductType.xBaseLoss: + case ProductType.y: + case ProductType.yWaterLoss: + case ProductType.yBaseLoss: + case ProductType.z: + case ProductType.zWaterLoss: + case ProductType.zBaseLoss: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.ThreePrime)); + break; + + case ProductType.M: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.Both)); + break; + + case ProductType.aStar: + case ProductType.bAmmoniaLoss: + case ProductType.D: + case ProductType.Ycore: + case ProductType.Y: + case ProductType.aDegree: + case ProductType.yAmmoniaLoss: + case ProductType.zPlusOne: + case ProductType.zDot: + Assert.Throws(() => type.GetRnaTerminusType()); + break; + default: + throw new ArgumentOutOfRangeException(); + } + } + } + + [Test] + [TestCase(ProductType.a, ProductType.aWaterLoss)] + [TestCase(ProductType.b, ProductType.bWaterLoss)] + [TestCase(ProductType.c, ProductType.cWaterLoss)] + [TestCase(ProductType.d, ProductType.dWaterLoss)] + [TestCase(ProductType.w, ProductType.wWaterLoss)] + [TestCase(ProductType.x, ProductType.xWaterLoss)] + [TestCase(ProductType.y, ProductType.yWaterLoss)] + [TestCase(ProductType.z, ProductType.zWaterLoss)] + public void EnsureWaterLossMassesAreCorrect(ProductType normal, ProductType waterLoss) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + + List normalFragments = rna.GetNeutralFragments(normal).ToList(); + List waterLossFragments = rna.GetNeutralFragments(waterLoss).ToList(); + for (var index = 0; index < waterLossFragments.Count; index++) + { + var waterLossFragment = waterLossFragments[index]; + var normalFragment = normalFragments[index]; + var watermass = 2 * Constants.ProtonMass + PeriodicTable.GetElement("O").PrincipalIsotope.AtomicMass; + + Assert.That(normalFragment.MonoisotopicMass, Is.EqualTo(waterLossFragment.MonoisotopicMass + watermass).Within(0.01)); + } + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestRnase.cs b/mzLib/Test/Transcriptomics/TestRnase.cs index 2657cb08e..b122f32bd 100644 --- a/mzLib/Test/Transcriptomics/TestRnase.cs +++ b/mzLib/Test/Transcriptomics/TestRnase.cs @@ -1,16 +1,13 @@ using NUnit.Framework; -using System; -using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.IO; -using System.Linq; using Proteomics.ProteolyticDigestion; using Transcriptomics.Digestion; namespace Test.Transcriptomics { [ExcludeFromCodeCoverage] - internal class TestRnase + public class TestRnase { public static string rnaseTsvpath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Digestion\rnases.tsv"); @@ -18,7 +15,7 @@ internal class TestRnase public void TestRnaseDictionaryLoading() { var rnaseCountFromTsv = File.ReadAllLines(rnaseTsvpath).Length - 1; - Assert.AreEqual(RnaseDictionary.Dictionary.Count, rnaseCountFromTsv); + Assert.That(RnaseDictionary.Dictionary.Count, Is.EqualTo(rnaseCountFromTsv)); } [Test] diff --git a/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs b/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs new file mode 100644 index 000000000..cd6301a91 --- /dev/null +++ b/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs @@ -0,0 +1,67 @@ +using FlashLFQ; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.Linq; 
+using System.Text; +using System.Threading.Tasks; + +namespace TestFlashLFQ +{ + public class ChromatographicPeakTests + { + private ChromatographicPeak CreateChromatographicPeak() + { + // Create a sample SpectraFileInfo + SpectraFileInfo spectraFileInfo = new SpectraFileInfo("sampleFile", "A", 1, 1, 1); + + // Create a sample Identification + Identification identification = new Identification(spectraFileInfo, "MPEPTIDE", "M[Oxidation]PEPTIDE", 100, 10, 2, new List()); + + // Create a ChromatographicPeak instance + ChromatographicPeak chromatographicPeak = new ChromatographicPeak(identification, false, spectraFileInfo); + + IndexedMassSpectralPeak peak1 = new IndexedMassSpectralPeak(100, 300, 1, 9.5); + IndexedMassSpectralPeak peak2 = new IndexedMassSpectralPeak(100, 300, 1, 10.5); + + // Add sample IsotopicEnvelopes + chromatographicPeak.IsotopicEnvelopes = new List() + { + new IsotopicEnvelope(peak1, 2, 300, 1), + new IsotopicEnvelope(peak2, 2, 300, 1) + }; + + return chromatographicPeak; + } + + + [Test] + public void TestResolveIdentifications() + { + // Arrange + ChromatographicPeak chromatographicPeak = CreateChromatographicPeak(); + + // Act + chromatographicPeak.ResolveIdentifications(); + + // Assert + Assert.AreEqual(1, chromatographicPeak.NumIdentificationsByBaseSeq); + Assert.AreEqual(1, chromatographicPeak.NumIdentificationsByFullSeq); + } + + [Test] + public void TestToString() + { + // Arrange + ChromatographicPeak chromatographicPeak = CreateChromatographicPeak(); + + // Act + string result = chromatographicPeak.ToString(); + + // Assert + string expected = "sampleFile\tMPEPTIDE\tM[Oxidation]PEPTIDE\t\t\t100\t10\t2\t51.007276466879\t0\t-\t-\t-\t-\t-\t0\tMSMS\t\t\t1\t1\t1\t0\tNaN\tFalse\tFalse"; + Assert.AreEqual(expected, result); + } + } +} diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index b94e32cf0..ed72ef077 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -5,12 
+5,13 @@ using MathNet.Numerics.Statistics; using MzLibUtil; using NUnit.Framework; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using CollectionAssert = NUnit.Framework.Legacy.CollectionAssert; using Proteomics.AminoAcidPolymer; using System; using System.Collections.Generic; using System.IO; using System.Linq; -using Easy.Common.Extensions; using Test.FileReadingTests; using UsefulProteomicsDatabases; using ChromatographicPeak = FlashLFQ.ChromatographicPeak; @@ -426,12 +427,18 @@ public static void TestFlashLfqNormalization() var id3 = new Identification(mzml, "EGFQVADGPLYR", "EGFQVADGPLYR", 1350.65681, 94.12193, 2, new List { pg }); var id4 = new Identification(mzml2, "EGFQVADGPLYR", "EGFQVADGPLYR", 1350.65681, 94.12193, 2, new List { pg }); - results = new FlashLfqEngine(new List { id1, id2, id3, id4 }, normalize: true).Run(); + results = new FlashLfqEngine(new List { id1, id2, id3, id4 }, normalize: true, integrate: false).Run(); int int7 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw2)); int int8 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml2)); Assert.That(int7 > 0); Assert.That(int7 == int8); + + results.ReNormalizeResults(true); + int int9 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw2)); + int int10 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml2)); + Assert.That(int9 > int7); + Assert.That(int9, Is.EqualTo(int10).Within(1)); } [Test] @@ -486,15 +493,15 @@ public static void TestFlashLfqMergeResults() public static void TestFlashLfqMatchBetweenRuns() { List filesToWrite = new List { "mzml_1", 
"mzml_2" }; - List pepSequences = new List - { - "PEPTIDE", - "PEPTIDEV", - "PEPTIDEVV", + List pepSequences = new List + { + "PEPTIDE", + "PEPTIDEV", + "PEPTIDEVV", "TARGETPEP", "PEPTIDEVVV", - "PEPTIDEVVVV", - "PEPTIDEVVVVA", + "PEPTIDEVVVV", + "PEPTIDEVVVVA", "PEPTIDEVVVVAA" }; double intensity = 1e6; @@ -592,9 +599,6 @@ public static void TestFlashLfqMatchBetweenRuns() FlashLfqEngine engine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true); FlashLfqEngine interquartileEngine = new FlashLfqEngine( new List { id1, id2, id3, id4, id5, id11, id12, id6, id7, id9, id10, id13, id14 }, matchBetweenRuns: true); - FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true, - peptideSequencesToUse: pepSequences); - //run the engine var results = engine.Run(); @@ -608,44 +612,33 @@ public static void TestFlashLfqMatchBetweenRuns() Assert.That(peak.Intensity > 0); Assert.That(peak.Intensity == otherFilePeak.Intensity); - Assert.That(peak.RtHypothesis.HasValue); - Assert.That(peak.RtHypothesis, Is.EqualTo(1.03).Within(0.01)); List rtDiffs = new(); for (int i = 0; i < 5; i++) { if (i == 2) continue; // exclude the mbr peak from the calculation rtDiffs.Add(Math.Abs(file1Rt[i] - file2Rt[i])); } - Assert.That(peak.RtStdDev.HasValue); - Assert.That(!peak.RtInterquartileRange.HasValue); - Assert.That(peak.RtStdDev, Is.EqualTo(rtDiffs.StandardDeviation()).Within(0.01)); Assert.That(results.Peaks[file1].Count == 5); Assert.That(!results.Peaks[file1].Any(p => p.IsMbrPeak)); - Assert.That(!results.Peaks[file1].Any(p => p.RtHypothesis.HasValue)); results = interquartileEngine.Run(); peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); - Assert.That(peak.RtHypothesis.HasValue); - Assert.That(peak.RtHypothesis, Is.EqualTo(1.04).Within(0.01)); for (int i = 0; i < 5; i++) { if (i == 2) continue; // exclude the mbr peak from the 
calculation rtDiffs.Add(Math.Abs(file1Rt[i] - file2Rt[i])); } - Assert.That(!peak.RtStdDev.HasValue); - Assert.That(peak.RtInterquartileRange.HasValue); - Assert.That(peak.RtInterquartileRange, Is.EqualTo(rtDiffs.InterquartileRange()).Within(0.01)); + FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true, peptideSequencesToQuantify: pepSequences, donorCriterion: DonorCriterion.Intensity); // The ambiguous engine tests that a non-confident ID (i.e., a PSM that didn't make the peptide level fdr cutoff) - // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwriteen by confident MS2 ids + // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwritten by confident MS2 ids results = engineAmbiguous.Run(); Assert.False(results.PeptideModifiedSequences.Select(kvp => kvp.Key).Contains("DECOYPEP")); Assert.False(results.Peaks[file1].Any(peak => peak.Identifications.Any(id => id.ModifiedSequence.Contains("DECOYPEP")))); Assert.That(results.Peaks[file2].Any(peak => peak.Identifications.First().ModifiedSequence == "TARGETPEP")); Assert.AreEqual(results.Peaks[file2].Count(peak => peak.IsMbrPeak), 2); - } [Test] @@ -1046,7 +1039,7 @@ public static void TestMatchBetweenRunsWithNoIdsInCommon() FlashLfqEngine engine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true); var results = engine.Run(); - // no assertions - just don't crash + Assert.Pass();// no assertions - just don't crash } [Test] @@ -1217,7 +1210,11 @@ public static void TestFlashLfqQoutputRealData() } } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, useSharedPeptidesForProteinQuant: true, maxThreads: -1); + var engine = new FlashLfqEngine(ids, + matchBetweenRuns: true, + requireMsmsIdInCondition: false, + useSharedPeptidesForProteinQuant: 
true, + maxThreads: -1); var results = engine.Run(); results.WriteResults(Path.Combine(outputDirectory,"peaks.tsv"), Path.Combine(outputDirectory, "peptides.tsv"), Path.Combine(outputDirectory, "proteins.tsv"), Path.Combine(outputDirectory, "bayesian.tsv"),true); @@ -1229,8 +1226,8 @@ public static void TestFlashLfqQoutputRealData() Assert.AreEqual(4, peaks[0].Count(m => m.IsMbrPeak == false)); Assert.AreEqual(5, peaks[1].Count(m => m.IsMbrPeak == false)); - CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "Q7KZF4", "P52298", "Q15149" }, peaks[0].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); - CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q7KZF4", "Q7KZF4", "P52298" }, peaks[1].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "Q7KZF4", "P52298", "Q15149", "Q15149" }, peaks[0].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "Q7KZF4", "P52298" }, peaks[1].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); Assert.AreEqual(6, peptides.Count); CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "P52298" }, peptides.Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); @@ -1354,6 +1351,7 @@ public static void RealDataMbrTest() double rt = double.Parse(split[2]); int z = (int)double.Parse(split[6]); var proteins = split[24].Split(new char[] { '|' }); + bool decoyPeptide = split[39].Equals("D"); List proteinGroups = new List(); foreach (var protein in proteins) { @@ -1368,65 +1366,62 @@ public static void RealDataMbrTest() } } - Identification id = 
new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups); + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: decoyPeptide); ids.Add(id); } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 1); + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 1, matchBetweenRunsFdrThreshold: 0.15, maxMbrWindow: 1); var results = engine.Run(); + // Count the number of MBR results in each file var f1r1MbrResults = results .PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MBR && p.Value.GetDetectionType(f1r2) == DetectionType.MSMS).ToList(); - - Assert.That(f1r1MbrResults.Count >= 132); - - var f1r2MbrResults = results.PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MSMS && p.Value.GetDetectionType(f1r2) == DetectionType.MBR).ToList(); - - Assert.That(f1r2MbrResults.Count >= 77); - - List<(double, double)> peptideIntensities = new List<(double, double)>(); + .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MBR && p.Value.GetDetectionType(f1r2) == DetectionType.MSMS) + .ToList(); + var f1r2MbrResults = results + .PeptideModifiedSequences + .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MSMS && p.Value.GetDetectionType(f1r2) == DetectionType.MBR) + .ToList(); - foreach (var peptide in f1r1MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(f1r1)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(f1r2)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } + // Due to the small number of results in the test data, the counts and correlation values can be quite variable. + // Any change to ML.NET or the PEP Analysis engine will cause these to change. 
+ Console.WriteLine("r1 PIP event count: " + f1r1MbrResults.Count); + Console.WriteLine("r2 PIP event count: " + f1r2MbrResults.Count); + Assert.AreEqual(138, f1r1MbrResults.Count); + Assert.AreEqual(70, f1r2MbrResults.Count); - double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.That(corr > 0.8); + // Check that MS/MS identified peaks and MBR identified peaks have similar intensities + List<(double, double)> peptideIntensities = f1r1MbrResults.Select(pep => (Math.Log(pep.Value.GetIntensity(f1r1)), Math.Log(pep.Value.GetIntensity(f1r2)))).ToList(); + double corrRun1 = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - peptideIntensities.Clear(); - foreach (var peptide in f1r2MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(f1r2)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(f1r1)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } - - corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); + peptideIntensities = f1r2MbrResults.Select(pep => (Math.Log(pep.Value.GetIntensity(f1r1)), Math.Log(pep.Value.GetIntensity(f1r2)))).ToList(); + double corrRun2 = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.That(corr > 0.7); + // These values are also sensitive, changes can cause them to dip as low as 0.6 (specifically the corrRun2 value) + Console.WriteLine("r1 correlation: " + corrRun1); + Console.WriteLine("r2 correlation: " + corrRun2); + Assert.Greater(corrRun1, 0.75); + Assert.Greater(corrRun2, 0.65); // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein // has to be observed in a condition for match-between-runs f1r1.Condition = "b"; - engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, 
maxThreads: 1); + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); results = engine.Run(); - var proteinsObservedInF1 = ids.Where(p => p.FileInfo == f1r1).SelectMany(p => p.ProteinGroups).Distinct().ToList(); - var proteinsObservedInF2 = ids.Where(p => p.FileInfo == f1r2).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF1 = ids.Where(id => !id.IsDecoy).Where(p => p.FileInfo == f1r1).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF2 = ids.Where(id => !id.IsDecoy).Where(p => p.FileInfo == f1r2).SelectMany(p => p.ProteinGroups).Distinct().ToList(); var proteinsObservedInF1ButNotF2 = proteinsObservedInF1.Except(proteinsObservedInF2).ToList(); foreach (ProteinGroup protein in proteinsObservedInF1ButNotF2) { Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(f1r2) == 0); } - List peptidesToUse = ids.Select(id => id.ModifiedSequence).Take(400).Distinct().ToList(); - engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, peptideSequencesToUse: peptidesToUse); + // Test that no decoys are reported in the final resultsw + Assert.AreEqual(0, ids.Where(id => id.IsDecoy).Count(id => results.ProteinGroups.ContainsKey(id.ProteinGroups.First().ProteinGroupName))); + + List peptidesToUse = ids.Where(id => id.QValue <= 0.007 & !id.IsDecoy).Select(id => id.ModifiedSequence).Distinct().ToList(); + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, matchBetweenRunsFdrThreshold: 0.5, maxMbrWindow: 1, peptideSequencesToQuantify: peptidesToUse); results = engine.Run(); - var test = results.PeptideModifiedSequences.Select(kvp => !peptidesToUse.Contains(kvp.Key)).ToList(); CollectionAssert.AreEquivalent(results.PeptideModifiedSequences.Select(kvp => kvp.Key), peptidesToUse); } @@ -1661,14 +1656,13 @@ public static void TestAmbiguousFraction() 
peak1.ResolveIdentifications(); peak2.ResolveIdentifications(); - peak1.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 1000)); - peak2.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 10000)); + peak1.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 1000, 1)); + peak2.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 10000, 1)); peak1.CalculateIntensityForThisFeature(false); peak2.CalculateIntensityForThisFeature(false); - FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }, - new HashSet { "peptide1", "peptide2"}); + FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }); res.Peaks[fraction1].Add(peak1); res.Peaks[fraction2].Add(peak2); res.CalculatePeptideResults(quantifyAmbiguousPeptides: false); diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.csproj b/mzLib/TestFlashLFQ/TestFlashLFQ.csproj index 06d014d2c..aa4f19039 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.csproj +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.csproj @@ -1,7 +1,7 @@  - net6.0-windows + net8.0-windows false x64 @@ -13,9 +13,10 @@ - - - + + + + diff --git a/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs new file mode 100644 index 000000000..cd6412731 --- /dev/null +++ b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs @@ -0,0 +1,90 @@ +using NUnit.Framework; +using Readers; +using System.Collections.Generic; +using System.Linq; +using FlashLFQ; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.IO; + +namespace TestFlashLFQ +{ + internal class TestIdentificationAdapter + { + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")] + public void TestAddProteinGroupInfoCorrect(string path) + { + string filePath = 
Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + + List identifications = new List(); + identifications = MzLibExtensions.MakeIdentifications(file); + + // list should contain five elements + Assert.That(identifications.Count, Is.EqualTo(5)); + // one protein associated with given results, list should only contain this one element + Assert.That(identifications[0].ProteinGroups.Count, Is.EqualTo(1)); + // two proteins associated with given results, list should contain two elements + Assert.That(identifications[2].ProteinGroups.Count, Is.EqualTo(2)); + + Identification identification1= identifications[0]; + Assert.That(identification1.BaseSequence, Is.EqualTo("KPVGAAK")); + Assert.That(identification1.ModifiedSequence, Is.EqualTo("KPVGAAK")); + Assert.That(identification1.Ms2RetentionTimeInMinutes, Is.EqualTo(1.9398)); + Assert.That(identification1.MonoisotopicMass, Is.EqualTo(669.4173)); + Assert.That(identification1.PrecursorChargeState, Is.EqualTo(2)); + + HashSet proteinGroups = identification1.ProteinGroups; + ProteinGroup proteinGroup1 = proteinGroups.First(); + Assert.That(proteinGroup1.ProteinGroupName, Is.EqualTo("P16403")); + Assert.That(proteinGroup1.GeneName, Is.EqualTo("H12")); + Assert.That(proteinGroup1.Organism, Is.EqualTo("HUMAN")); + + Identification identification5 = identifications[4]; + Assert.That(identification5.BaseSequence, Is.EqualTo("VVTHGGR")); + Assert.That(identification5.ModifiedSequence, Is.EqualTo("VVTHGGR")); + Assert.That(identification5.Ms2RetentionTimeInMinutes, Is.EqualTo(19.114)); + Assert.That(identification5.MonoisotopicMass, Is.EqualTo(724.398)); + Assert.That(identification5.PrecursorChargeState, Is.EqualTo(2)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv")] + public void TestFileNametoFilePath(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + 
MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + string fileName = file.First().FileName; + + List fullFilePath = new List(); + string fullFilePath1 = @"D:\Projects\Chimeras\Mann_11cell_analysis\RawData\interact-20100611_Velos1_TaGe_SA_Hela_1.raw"; + string fullFilePath2 = @"FileReadingTests\ExternalFileTypes\FraggerPsm_FragPipev21.1_psm.tsv"; + fullFilePath.Add(fullFilePath1); + fullFilePath.Add(fullFilePath2); + + Dictionary allFiles = file.FileNameToFilePath(fullFilePath); + + Assert.That(allFiles.TryGetValue(fileName, out var output)); + Assert.AreEqual(output, fullFilePath1); + Assert.That(!allFiles.ContainsValue(fullFilePath2)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\SmallCalibratibleYeastFragger_psm.tsv")] + public void TestFileNametoFilePathLocalPath(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerPsmFile file = new MsFraggerPsmFile(filePath); + string fileName = file.First().FileName; + + List fullFilePath = new List(); + string rawFilePath = @"DataFiles\SmallCalibratibleYeast.mzml"; + fullFilePath.Add(rawFilePath); + + Dictionary allFiles = file.FileNameToFilePath(fullFilePath); + + Assert.That(allFiles.TryGetValue(fileName, out var output)); + Assert.AreEqual(output, rawFilePath); + } + } +} \ No newline at end of file diff --git a/mzLib/TestFlashLFQ/TestPipEcho.cs b/mzLib/TestFlashLFQ/TestPipEcho.cs new file mode 100644 index 000000000..0d2388142 --- /dev/null +++ b/mzLib/TestFlashLFQ/TestPipEcho.cs @@ -0,0 +1,313 @@ +using NUnit.Framework; +using Readers; +using System.Collections.Generic; +using System.Linq; +using FlashLFQ; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.IO; +using FlashLFQ.PEP; +using System; +using Chemistry; +using MassSpectrometry; +using MzLibUtil; +using Test.FileReadingTests; +using UsefulProteomicsDatabases; + + +namespace TestFlashLFQ +{ + [TestFixture] + 
[System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + public class TestPipEcho + { + [Test] + [TestCase(3)] + [TestCase(5)] + public static void TestDonorGroupEqualizer(int numGroups) + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + + ChromatographicPeak targetPeak = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + ChromatographicPeak decoyPeak = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: true); + targetPeak.MbrScore = 100; + + Random random = new Random(42); + List donorGroups = new List(); + for (int i = 0; i < 10000; i++) + { + int numberTargets = random.Next(0, 10); + int numberDecoys = random.Next(0, 10); + donorGroups.Add(new DonorGroup(id, Enumerable.Repeat(targetPeak, numberTargets).ToList(), Enumerable.Repeat(decoyPeak, numberDecoys).ToList())); + } + + donorGroups = PepAnalysisEngine.OrderDonorGroups(donorGroups); + var donorIndices = PepAnalysisEngine.GetDonorGroupIndices(donorGroups, numGroups: numGroups, scoreCutoff: 50); + + Assert.That(donorIndices.Count, Is.EqualTo(numGroups)); + List targetPeakCounts = new(); + List decoyPeakCounts = new(); + for (int i = 0; i < numGroups; i++) + { + int targetSum = 0; + int decoySum = 0; + foreach (int idx in donorIndices[i]) + { + targetSum += donorGroups[idx].TargetAcceptors.Count; + decoySum += donorGroups[idx].DecoyAcceptors.Count; + } + targetPeakCounts.Add(targetSum); + decoyPeakCounts.Add(decoySum); + } + + // Assert that each group has an approximately equal number of target peaks + Assert.That(targetPeakCounts.Max() - targetPeakCounts.Min(), Is.LessThanOrEqualTo(numGroups-1)); + // Assert that each group has an approximately equal number of decoy peaks + Assert.That(decoyPeakCounts.Max() - decoyPeakCounts.Min(), Is.LessThanOrEqualTo(numGroups - 1)); + } + + [Test] + 
public static void TestMbrScorer() + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + SpectraFileInfo fakeDonorFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + + double idMass = 669.4173; + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification id2 = new Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification donorId = new Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + id.PeakfindingMass = idMass; + id2.PeakfindingMass = idMass; + donorId.PeakfindingMass = idMass; + + var peak1 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var peak2 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var peak3 = new ChromatographicPeak(id2, isMbrPeak: false, fakeFile, randomRt: false); + var peak4 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var donorPeak = new ChromatographicPeak(donorId, isMbrPeak: false, fakeDonorFile, randomRt: false); + var acceptorPeak = new ChromatographicPeak(donorId, isMbrPeak: true, fakeFile, randomRt: false); + + IndexedMassSpectralPeak imsPeak = new IndexedMassSpectralPeak((idMass + 0.001).ToMz(1), 1.1, 1, 25); + IndexedMassSpectralPeak imsPeak2 = new IndexedMassSpectralPeak((idMass - 0.001).ToMz(1), 1, 2, 26); + var iso1 = new FlashLFQ.IsotopicEnvelope(imsPeak, 1, 1, 0.98); + var iso2 = new FlashLFQ.IsotopicEnvelope(imsPeak2, 1, 1, 0.9); + + peak1.IsotopicEnvelopes.Add(iso1); + peak1.IsotopicEnvelopes.Add(iso2); + peak1.CalculateIntensityForThisFeature(false); + + peak4.IsotopicEnvelopes.Add(iso2); + peak4.CalculateIntensityForThisFeature(false); + + donorPeak.IsotopicEnvelopes.Add(iso2); + donorPeak.CalculateIntensityForThisFeature(false); + + 
acceptorPeak.IsotopicEnvelopes.Add(iso1); + acceptorPeak.CalculateIntensityForThisFeature(false); + + + var peakList = new List { peak1, peak4 }; + var peakDict = peakList.ToDictionary(keySelector: p => p.Apex.IndexedPeak, elementSelector: p => p); + + // Builds a scorer. Ppm Error and Intensity distributions both have mean and std-dev of 1 + MbrScorer scorer = new MbrScorer(peakDict, peakList, new MathNet.Numerics.Distributions.Normal(1, 1), new MathNet.Numerics.Distributions.Normal(1,1)); + + scorer.AddRtPredErrorDistribution(fakeDonorFile, new List { 0.5, 0.6, 0.5, 0.6, 0.5, 0.6, 0.5 }, 2); + + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, predictedRt: 25.1); + + Assert.That(acceptorPeak.MbrScore, Is.EqualTo(58.7).Within(0.1)); + Assert.That(acceptorPeak.PpmScore, Is.EqualTo(0.62).Within(0.01)); + Assert.That(acceptorPeak.IntensityScore, Is.EqualTo(0.32).Within(0.01)); + Assert.That(acceptorPeak.RtScore, Is.EqualTo(0.96).Within(0.01)); + Assert.That(acceptorPeak.ScanCountScore, Is.EqualTo(0.5).Within(0.01)); + Assert.That(acceptorPeak.IsotopicDistributionScore, Is.EqualTo(0.74).Within(0.01)); + } + + [Test] + public static void TestSpectraFileInfoString() + { + SpectraFileInfo fakeFile = new SpectraFileInfo(@"C:\Users\xyz\data\fakeFile.raw", "A", 1, 1, 1); + Assert.AreEqual("fakeFile.raw", fakeFile.ToString()); + } + + [Test] + public static void TestChromatographicPeakEquals() + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification id2 = new Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + + var peak1 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + var peak2 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + var peak3 = new 
ChromatographicPeak(id2, isMbrPeak: true, fakeFile, randomRt: false); + var peak4 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + + IndexedMassSpectralPeak imsPeak = new IndexedMassSpectralPeak(1, 1, 1, 25); + IndexedMassSpectralPeak imsPeak2 = new IndexedMassSpectralPeak(1, 1, 1, 50); + var iso1 = new FlashLFQ.IsotopicEnvelope(imsPeak, 1, 1, 1); + var iso2 = new FlashLFQ.IsotopicEnvelope(imsPeak2, 1, 1, 1); + + peak1.IsotopicEnvelopes.Add(iso1); + peak1.CalculateIntensityForThisFeature(false); + + peak2.IsotopicEnvelopes.Add(iso1); + peak2.CalculateIntensityForThisFeature(false); + + peak3.IsotopicEnvelopes.Add(iso1); + peak3.CalculateIntensityForThisFeature(false); + + peak4.IsotopicEnvelopes.Add(iso2); + peak4.CalculateIntensityForThisFeature(false); + + Assert.That(peak1.Equals(peak2)); + Assert.That(!peak1.Equals(peak3)); + Assert.That(!peak1.Equals(peak4)); + + } + + /// + /// This test MatchBetweenRuns by creating two fake mzML files and a list of fake IDs. 
+ /// There are multiple sets of IDs, where most are shared between the two runs but one+ is/are missing + /// MBR is tested by ensuring that IDs are transferred between runs + /// + [Test] + public static void TestFlashLfqMatchBetweenRunsNearestNeighborDonors() + { + List filesToWrite = new List { "mzml_1", "mzml_2", "mzml_3" }; + List pepSequences = new List + { + "PEPTIDE", + "PEPTIDEV", + "PEPTIDEVV", + "TARGETPEP", + "PEPTIDEVVV", + "PEPTIDEVVVV", + "PEPTIDEVVVVA", + "PEPTIDEVVVVAA" + }; + double intensity = 1e6; + + double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.033, 1.035, 1.04, 1.045, 1.05 }; + double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.031, 1.035, 1.04, 1.055, 1.07 }; + + Loaders.LoadElements(); + + // generate mzml files (5 peptides each) + for (int f = 0; f < filesToWrite.Count; f++) + { + // 1 MS1 scan per peptide + MsDataScan[] scans = new MsDataScan[8]; + + for (int p = 0; p < pepSequences.Count; p++) + { + ChemicalFormula cf = new Proteomics.AminoAcidPolymer.Peptide(pepSequences[p]).GetChemicalFormula(); + IsotopicDistribution dist = IsotopicDistribution.GetDistribution(cf, 0.125, 1e-8); + double[] mz = dist.Masses.Select(v => v.ToMz(1)).ToArray(); + double[] intensities = dist.Intensities.Select(v => v * intensity).ToArray(); + if(f == 2) + { + // Make file 3 the most intense + intensities = intensities.Select(v => v * 5).ToArray(); + } + double rt; + if (f == 1) + { + rt = file2Rt[p]; + } + else + { + rt = file1Rt[p]; + } + + // add the scan + scans[p] = new MsDataScan(massSpectrum: new MzSpectrum(mz, intensities, false), oneBasedScanNumber: p + 1, msnOrder: 1, isCentroid: true, + polarity: Polarity.Positive, retentionTime: rt, scanWindowRange: new MzRange(400, 1600), scanFilter: "f", + mzAnalyzer: MZAnalyzerType.Orbitrap, totalIonCurrent: intensities.Sum(), injectionTime: 1.0, noiseData: null, nativeId: "scan=" + (p + 1)); + } + + // write the .mzML + Readers.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(new 
FakeMsDataFile(scans), + Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[f] + ".mzML"), false); + } + + // set up spectra file info + SpectraFileInfo file1 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[0] + ".mzML"), "a", 0, 0, 0); + SpectraFileInfo file2 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[1] + ".mzML"), "a", 1, 0, 0); + SpectraFileInfo file3 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[2] + ".mzML"), "a", 2, 0, 0); + + // create some PSMs + var pg = new ProteinGroup("MyProtein", "gene", "org"); + Identification id1 = new Identification(file1, "PEPTIDE", "PEPTIDE", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file1Rt[0] + 0.001, 1, new List { pg }); + Identification id2 = new Identification(file1, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file1Rt[1] + 0.001, 1, new List { pg }); + Identification id3 = new Identification(file1, "PEPTIDEVV", "PEPTIDEVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List { pg }); + Identification id4 = new Identification(file1, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); + Identification id5 = new Identification(file1, "PEPTIDEVVVV", "PEPTIDEVVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List { pg }); + + Identification id6 = new Identification(file2, "PEPTIDE", "PEPTIDE", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file2Rt[0] + 0.001, 1, new List { pg }); + Identification id7 = new Identification(file2, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file2Rt[1] + 0.001, 1, new List { pg }); + // missing ID 8 
- MBR feature - "PEPTIDEVV" + + Identification id9 = new Identification(file2, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List { pg }); + Identification id10 = new Identification(file2, "PEPTIDEVVVV", "PEPTIDEVVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List { pg }); + + + Identification id11 = new Identification(file3, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file1Rt[1] + 0.001, 1, new List { pg }); // same as peak 2 + Identification id12 = new Identification(file3, "PEPTIDEVV", "PEPTIDEVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List { pg }); // same as peak 3, but higher intensity + Identification id13 = new Identification(file3, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); // same as peak 4 + + + // create the FlashLFQ engine + FlashLfqEngine neighborsEngine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id11, id12, id13 }, + matchBetweenRuns: true, donorCriterion: DonorCriterion.Neighbors); + + //run the engine + var results = neighborsEngine.Run(); + + Assert.That(results.Peaks[file2].Count == 5); + Assert.That(results.Peaks[file2].Where(p => p.IsMbrPeak).Count() == 1); + + var peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); + var otherFilePeak = results.Peaks[file1].Where(p => p.Identifications.First().BaseSequence == + peak.Identifications.First().BaseSequence).First(); + + + Assert.That(peak.Intensity > 0); + Assert.That(peak.Intensity == otherFilePeak.Intensity); + Assert.That(peak.Identifications.First().FileInfo == file1); // assure that the ID came from file 1, ie, the donor with the most neighboring peaks + + // create the FlashLFQ engine + 
FlashLfqEngine intensityEngine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id11, id12, id13 }, + matchBetweenRuns: true, donorCriterion: DonorCriterion.Intensity); + + //run the engine + results = intensityEngine.Run(); + + Assert.That(results.Peaks[file2].Count == 5); + Assert.That(results.Peaks[file2].Where(p => p.IsMbrPeak).Count() == 1); + + peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); + otherFilePeak = results.Peaks[file3].Where(p => p.Identifications.First().BaseSequence == + peak.Identifications.First().BaseSequence).First(); + + + Assert.That(peak.Intensity > 0); + Assert.That(peak.Intensity, Is.EqualTo(otherFilePeak.Intensity/5).Within(1)); // file 3 is five times more intense than file 2 + Assert.That(peak.Identifications.First().FileInfo == file3); // assure that the ID came from file 3, ie, the most intense donor peaks + + } + + } +} diff --git a/mzLib/Transcriptomics/ClassExtensions.cs b/mzLib/Transcriptomics/ClassExtensions.cs new file mode 100644 index 000000000..ef56c737d --- /dev/null +++ b/mzLib/Transcriptomics/ClassExtensions.cs @@ -0,0 +1,119 @@ +using Omics.Modifications; +using System.Text; +using Transcriptomics.Digestion; + +namespace Transcriptomics +{ + public static class ClassExtensions + { + /// + /// Creates a new instance of a nucleic acid or oligo with set modifications, optionally updating its sequence, modifications, and decoy status. + /// + /// The type of the nucleic acid, which must implement . + /// The target nucleic acid or oligo with set modifications to base the new instance on. + /// The new sequence string, if any. If null, the original sequence is used. + /// A dictionary of modifications to apply, if any. If null, the original modifications are used. + /// A flag indicating whether the sequence is a decoy, if any. If null, the original decoy status is used. + /// A new instance of the specified nucleic acid type with the provided or existing properties. 
+ /// + /// This method facilitates the generation of new sequences for both nucleic acids and oligos with set modifications by allowing + /// optional updates to the sequence string, modifications, and decoy status. It ensures that the new instances are properly + /// initialized with the provided or existing properties, enabling further analysis of modified sequences and future generation of decoys on the fly. + /// + public static T CreateNew(this T target, string? sequence = null, IDictionary>? modifications = null, + bool? isDecoy = null) + where T : INucleicAcid + { + // set new object parameters where not null + object? returnObj = null; + string newSequence = sequence ?? target.BaseSequence; + IDictionary> newModifications = modifications ?? target.OneBasedPossibleLocalizedModifications; + + switch (target) + { + case RNA rna: + { + bool newIsDecoy = isDecoy ?? rna.IsDecoy; + returnObj = new RNA(newSequence, rna.Name, rna.Accession, rna.Organism, rna.DatabaseFilePath, + rna.FivePrimeTerminus, rna.ThreePrimeTerminus, newModifications, rna.IsContaminant, newIsDecoy, rna.GeneNames.ToList(), rna.AdditionalDatabaseFields); + break; + } + case OligoWithSetMods oligo: + { + var oldParent = oligo.Parent as RNA ?? throw new NullReferenceException(); + bool newIsDecoy = isDecoy ?? 
oldParent.IsDecoy; + var newParent = new RNA( + newSequence, + oldParent.Name, + oldParent.Accession, + oldParent.Organism, + oldParent.DatabaseFilePath, + oldParent.FivePrimeTerminus, + oldParent.ThreePrimeTerminus, + newModifications, + oldParent.IsContaminant, + newIsDecoy, + oldParent.GeneNames.ToList(), + oldParent.AdditionalDatabaseFields); + + returnObj = new OligoWithSetMods( + newParent, + (oligo.DigestionParams as RnaDigestionParams)!, + oligo.OneBasedStartResidue, + oligo.OneBasedEndResidue, + oligo.MissedCleavages, + oligo.CleavageSpecificityForFdrCategory, + newModifications.ToDictionary(p => p.Key, p => p.Value.First()), + oligo.NumFixedMods, + oligo.FivePrimeTerminus, + oligo.ThreePrimeTerminus); + break; + } + default: + throw new ArgumentException("INucleicAcid type not yet implemented"); + } + + return (T)returnObj ?? throw new NullReferenceException("Error creating new INucleicAcid"); + } + + /// + /// Transcribes a DNA sequence into an RNA sequence + /// + /// The input dna sequence + /// True if the input sequence is the coding strand, False if the input sequence is the template strand + /// + public static string Transcribe(this string dna, bool isCodingStrand = true) + { + var sb = new StringBuilder(); + foreach (var residue in dna) + { + if (isCodingStrand) + { + sb.Append(residue == 'T' ? 
'U' : residue); + } + else + { + switch (residue) + { + case 'A': + sb.Append('U'); + break; + case 'T': + sb.Append('A'); + break; + case 'C': + sb.Append('G'); + break; + case 'G': + sb.Append('C'); + break; + default: + sb.Append(residue); + break; + } + } + } + return sb.ToString(); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs new file mode 100644 index 000000000..d2d41cba7 --- /dev/null +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -0,0 +1,178 @@ +using Chemistry; +using Omics.Digestion; +using Omics.Modifications; + +namespace Transcriptomics.Digestion +{ + /// + /// The most basic form of a digested oligo, this class does not care about mass or formula, just base sequence + /// + public class NucleolyticOligo : DigestionProduct + { + protected IHasChemicalFormula _fivePrimeTerminus; + protected IHasChemicalFormula _threePrimeTerminus; + + internal NucleolyticOligo(NucleicAcid nucleicAcid, int oneBaseStartResidue, + int oneBasedEndResidue, int missedCleavages, CleavageSpecificity cleavageSpecificity, + IHasChemicalFormula? fivePrimeTerminus, IHasChemicalFormula? threePrimeTerminus) + : base(nucleicAcid, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, cleavageSpecificity) + { + _fivePrimeTerminus = fivePrimeTerminus ?? NucleicAcid.DefaultFivePrimeTerminus; + _threePrimeTerminus = threePrimeTerminus ?? NucleicAcid.DefaultThreePrimeTerminus; + } + + /// + /// Nucleic acid this oligo was digested from + /// + public NucleicAcid NucleicAcid + { + get => Parent as NucleicAcid; + protected set => Parent = value; + } + + public override string ToString() + { + return BaseSequence; + } + + /// + /// Generates a collection of oligos with set modifications based on the provided fixed and variable modifications, + /// digestion parameters, and the nucleic acid sequence. + /// + /// A collection of all known fixed modifications. + /// Parameters for RNA digestion. 
+ /// A list of variable modifications to consider. + /// An enumerable collection of oligos with set modifications. + /// + /// Code heavily borrowed from ProteolyticPeptide.GetModifiedPeptides + /// + internal IEnumerable GenerateModifiedOligos(List allKnownFixedMods, + RnaDigestionParams digestionParams, List variableModifications) + { + int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; + int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; + int maxModsForOligo = digestionParams.MaxMods; + var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(oligoLength + 4); + + var fivePrimeVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(1, fivePrimeVariableMods); + + var threePrimeVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(oligoLength + 2, threePrimeVariableMods); + + // collect all possible variable mods, skipping if there is a database annotated modification + foreach (Modification variableModification in variableModifications) + { + // Check if can be a 5'-term mod + if (CanBeFivePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, 1, variableModification)) + { + fivePrimeVariableMods.Add(variableModification); + } + + for (int r = 0; r < oligoLength; r++) + { + if (variableModification.LocationRestriction == "Anywhere." 
&& + ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) + && !ModificationLocalization.UniprotModExists(NucleicAcid, r + 1, variableModification)) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + } + // Check if can be a 3'-term mod + if (CanBeThreePrime(variableModification, oligoLength) && !ModificationLocalization.UniprotModExists(NucleicAcid, oligoLength, variableModification)) + { + threePrimeVariableMods.Add(variableModification); + } + } + + // collect all localized modifications from the database. + foreach (var kvp in NucleicAcid.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; + if (!inBounds) + { + continue; + } + + int locInPeptide = kvp.Key - OneBasedStartResidue + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is Modification variableModification) + { + // Check if can be a 5'-term mod + if (locInPeptide == 1 && CanBeFivePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + { + fivePrimeVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < oligoLength + && (NucleicAcid.IsDecoy || + (ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, r + 1, oligoLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } 
+ else + { + residueVariableMods.Add(variableModification); + } + } + + // Check if can be a 3'-term mod + if (locInPeptide == oligoLength && CanBeThreePrime(variableModification, oligoLength) && !NucleicAcid.IsDecoy) + { + threePrimeVariableMods.Add(variableModification); + } + } + } + } + + int variableModificationIsoforms = 0; + + // Add the mods to the oligo by return numerous OligoWithSetMods + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) + { + int numFixedMods = 0; + foreach (var fixedModPattern in GetFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods)) + { + if (!variableModPattern.ContainsKey(fixedModPattern.Key)) + { + numFixedMods++; + variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); + } + } + yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, + CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); + variableModificationIsoforms++; + if (variableModificationIsoforms == maximumVariableModificationIsoforms) + { + yield break; + } + } + } + + private bool CanBeFivePrime(Modification variableModification, int peptideLength) + { + return (variableModification.LocationRestriction == "5'-terminal." || variableModification.LocationRestriction == "Oligo 5'-terminal.") + && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, 1, peptideLength, OneBasedStartResidue); + } + + private bool CanBeThreePrime(Modification variableModification, int peptideLength) + { + return (variableModification.LocationRestriction == "3'-terminal." 
|| variableModification.LocationRestriction == "Oligo 3'-terminal.") + && ModificationLocalization.ModFits(variableModification, NucleicAcid.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs new file mode 100644 index 000000000..19902f57e --- /dev/null +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -0,0 +1,381 @@ +using Chemistry; +using MassSpectrometry; +using Omics.Digestion; +using Omics.Fragmentation; +using Omics.Modifications; +using Omics; +using Easy.Common.Extensions; +using Omics.Fragmentation.Oligo; +using System.Text; + +namespace Transcriptomics.Digestion +{ + /// + /// Represents an oligonucleotide with set modifications, providing properties and methods for + /// accessing and manipulating its chemical characteristics. + /// + /// + /// The monoisotopic mass, most abundant mass, and chemical formula are calculated on the fly if the corresponding properties + /// (_monoisotopicMass, _thisChemicalFormula, _mostAbundantMonoisotopicMass) are null. This ensures that the most up-to-date values are + /// always available based on the current state of the oligonucleotide and its modifications. Therefor, it is important to set those + /// properties to null whenever a termini or modification is changed. + /// + public class OligoWithSetMods : NucleolyticOligo, IBioPolymerWithSetMods, INucleicAcid + { + public OligoWithSetMods(NucleicAcid nucleicAcid, RnaDigestionParams digestionParams, int oneBaseStartResidue, + int oneBasedEndResidue, int missedCleavages, CleavageSpecificity cleavageSpecificity, + Dictionary allModsOneIsNTerminus, int numFixedMods, IHasChemicalFormula? fivePrimeTerminus = null, + IHasChemicalFormula? 
threePrimeTerminus = null) + : base(nucleicAcid, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, + cleavageSpecificity, fivePrimeTerminus, threePrimeTerminus) + { + _digestionParams = digestionParams; + _allModsOneIsNterminus = allModsOneIsNTerminus; + NumFixedMods = numFixedMods; + FullSequence = this.DetermineFullSequence(); + } + + public OligoWithSetMods(string sequence, Dictionary allKnownMods, int numFixedMods = 0, + RnaDigestionParams digestionParams = null, NucleicAcid n = null, int oneBaseStartResidue = 1, int oneBasedEndResidue = 0, + int missedCleavages = 0, CleavageSpecificity cleavageSpecificity = CleavageSpecificity.Full, string description = null, + IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null) + : base(n, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, + cleavageSpecificity, fivePrimeTerminus, threePrimeTerminus) + { + if (sequence.Contains("|")) + { + throw new MzLibUtil.MzLibException("Ambiguous oligo cannot be parsed from string: " + sequence); + } + + FullSequence = sequence; + _baseSequence = IBioPolymerWithSetMods.GetBaseSequenceFromFullSequence(sequence); + _allModsOneIsNterminus = GetModsAfterDeserialization(allKnownMods); + NumFixedMods = numFixedMods; + _digestionParams = digestionParams; + Description = description; + + if (n != null) + Parent = n; + } + + private RnaDigestionParams _digestionParams; + private Dictionary _allModsOneIsNterminus; + private double? _monoisotopicMass; + private ChemicalFormula? _thisChemicalFormula; + private double? _mostAbundantMonoisotopicMass; + private IDictionary>? _oneBasedPossibleLocalizedModifications; + private string? 
_sequenceWithChemicalFormula; + + public string FullSequence { get; private set; } + public IDigestionParams DigestionParams => _digestionParams; + public IHasChemicalFormula FivePrimeTerminus + { + get => _fivePrimeTerminus; + set + { + _fivePrimeTerminus = value; + _monoisotopicMass = null; + _thisChemicalFormula = null; + _mostAbundantMonoisotopicMass = null; + } + } + + public IHasChemicalFormula ThreePrimeTerminus + { + get => _threePrimeTerminus; + set + { + _threePrimeTerminus = value; + _monoisotopicMass = null; + _thisChemicalFormula = null; + _mostAbundantMonoisotopicMass = null; + } + } + + public double MonoisotopicMass + { + get + { + _monoisotopicMass ??= BaseSequence.Sum(nuc => Nucleotide.GetResidue(nuc).MonoisotopicMass) + + AllModsOneIsNterminus.Values.Sum(mod => mod.MonoisotopicMass!.Value) + + FivePrimeTerminus.MonoisotopicMass + + ThreePrimeTerminus.MonoisotopicMass; + return _monoisotopicMass.Value; + } + } + + public ChemicalFormula ThisChemicalFormula + { + get + { + if (_thisChemicalFormula is not null) return _thisChemicalFormula!; + + var fullFormula = new RNA(BaseSequence, FivePrimeTerminus, ThreePrimeTerminus).GetChemicalFormula(); + foreach (var mod in AllModsOneIsNterminus.Values) + { + if (mod.ChemicalFormula is null) + { + fullFormula = null; + break; + } + fullFormula.Add(mod.ChemicalFormula); + } + _thisChemicalFormula = fullFormula; + return _thisChemicalFormula!; + } + } + + public double MostAbundantMonoisotopicMass + { + get + { + if (_mostAbundantMonoisotopicMass is not null) return _mostAbundantMonoisotopicMass.Value; + + var distribution = IsotopicDistribution.GetDistribution(ThisChemicalFormula); + double maxIntensity = distribution.Intensities.Max(); + _mostAbundantMonoisotopicMass = distribution.Masses[distribution.Intensities.IndexOf(maxIntensity)].RoundedDouble(); + return _mostAbundantMonoisotopicMass!.Value; + } + } + + public string SequenceWithChemicalFormulas + { + get + { + if (_sequenceWithChemicalFormula is not 
null) return _sequenceWithChemicalFormula; + + var subsequence = new StringBuilder(); + // variable modification on peptide N-terminus + if (AllModsOneIsNterminus.TryGetValue(1, out Modification? pepNTermVariableMod)) + { + if (pepNTermVariableMod is { } mod) + subsequence.Append('[' + mod.ChemicalFormula.Formula + ']'); + } + + for (int r = 0; r < Length; r++) + { + subsequence.Append(this[r]); + // variable modification on this residue + if (!AllModsOneIsNterminus.TryGetValue(r + 2, out Modification? residueVariableMod)) continue; + if (residueVariableMod is { } mod) + subsequence.Append('[' + mod.ChemicalFormula.Formula + ']'); + } + + // variable modification on peptide C-terminus + if (AllModsOneIsNterminus.TryGetValue(Length + 2, out Modification? pepCTermVariableMod)) + { + if (pepCTermVariableMod is { } mod) + subsequence.Append('[' + mod.ChemicalFormula.Formula + ']'); + } + + _sequenceWithChemicalFormula = subsequence.ToString(); + return _sequenceWithChemicalFormula; + } + } + + public Dictionary AllModsOneIsNterminus => _allModsOneIsNterminus; + + public IDictionary> OneBasedPossibleLocalizedModifications => _oneBasedPossibleLocalizedModifications ??= + _allModsOneIsNterminus.ToDictionary(p => p.Key, p => new List() { p.Value }); + public int NumMods => AllModsOneIsNterminus.Count; + public int NumFixedMods { get; } + public int NumVariableMods => NumMods - NumFixedMods; + + /// + /// Generates theoretical fragments for given dissociation type for this peptide. + /// The "products" parameter is filled with these fragments. 
+ /// + public void Fragment(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus, + List products) + { + products.Clear(); + + List fivePrimeProductTypes = + dissociationType.GetRnaTerminusSpecificProductTypesFromDissociation(FragmentationTerminus.FivePrime); + List threePrimeProductTypes = + dissociationType.GetRnaTerminusSpecificProductTypesFromDissociation(FragmentationTerminus.ThreePrime); + + bool calculateFivePrime = + fragmentationTerminus is FragmentationTerminus.FivePrime or FragmentationTerminus.Both; + bool calculateThreePrime = + fragmentationTerminus is FragmentationTerminus.ThreePrime or FragmentationTerminus.Both; + + var sequence = (Parent as NucleicAcid)!.NucleicAcidArray[(OneBasedStartResidue - 1)..OneBasedEndResidue]; + + // intact product ion + if (fragmentationTerminus is FragmentationTerminus.Both or FragmentationTerminus.None) + products.AddRange(GetNeutralFragments(ProductType.M, sequence)); + + if (calculateFivePrime) + foreach (var type in fivePrimeProductTypes) + products.AddRange(GetNeutralFragments(type, sequence)); + + if (calculateThreePrime) + foreach (var type in threePrimeProductTypes) + products.AddRange(GetNeutralFragments(type, sequence)); + } + + /// + /// Generates theoretical internal fragments for given dissociation type for this peptide. + /// The "products" parameter is filled with these fragments. + /// The "minLengthOfFragments" parameter is the minimum number of nucleic acids for an internal fragment to be included + /// + public void FragmentInternally(DissociationType dissociationType, int minLengthOfFragments, + List products) + { + throw new NotImplementedException(); + } + + /// + /// Calculates all the fragments of the types you specify + /// + /// product type to get neutral fragments from + /// Sequence to generate fragments from, will be calculated from the parent if left null + /// + public IEnumerable GetNeutralFragments(ProductType type, Nucleotide[]? 
sequence = null) + { + sequence ??= (Parent as NucleicAcid)!.NucleicAcidArray[(OneBasedStartResidue - 1)..OneBasedEndResidue]; + + if (type is ProductType.M) + { + yield return new Product(type, FragmentationTerminus.None, MonoisotopicMass, 0, 0, 0); + yield break; + } + + // determine mass of piece remaining after fragmentation + double monoMass = type.GetRnaMassShiftFromProductType(); + + // determine mass of terminal cap and add to fragment + bool isThreePrimeTerminal = type.GetRnaTerminusType() == FragmentationTerminus.ThreePrime; + IHasChemicalFormula terminus = isThreePrimeTerminal ? ThreePrimeTerminus : FivePrimeTerminus; + monoMass += terminus.MonoisotopicMass; + + // determine mass of each polymer component that is contained within the fragment and add to fragment + bool first = true; //set first to true to hand the terminus mod first + for (int i = 0; i <= BaseSequence.Length - 1; i++) + { + int naIndex = isThreePrimeTerminal ? Length - i : i - 1; + if (first) + { + first = false; //set to false so only handled once + continue; + } + monoMass += sequence[naIndex].MonoisotopicMass; + + if (i < 1) + continue; + + // add side-chain mod + if (AllModsOneIsNterminus.TryGetValue(naIndex + 2, out Modification mod)) + { + monoMass += mod.MonoisotopicMass ?? 0; + } + + var previousNucleotide = sequence[naIndex]; + + double neutralLoss = 0; + if (type.ToString().Contains("Base")) + { + neutralLoss = previousNucleotide.BaseChemicalFormula.MonoisotopicMass; + } + + yield return new Product(type, + isThreePrimeTerminal ? FragmentationTerminus.ThreePrime : FragmentationTerminus.FivePrime, + monoMass - neutralLoss, i, + isThreePrimeTerminal ? 
BaseSequence.Length - i : i, 0, null, 0); + } + } + + /// + /// Outputs a duplicate IBioPolymerWithSetMods with a localized mass shift, replacing a modification when present + /// + /// Used to localize an unknown mass shift in the MetaMorpheus Localization Engine + /// + /// + /// The index of the modification in the AllModOneIsNTerminus Dictionary - 2 (idk why -2) + /// The mass to add to the BioPolymer + public IBioPolymerWithSetMods Localize(int indexOfMass, double massToLocalize) + { + var dictWithLocalizedMass = new Dictionary(AllModsOneIsNterminus); + double massOfExistingMod = 0; + if (dictWithLocalizedMass.TryGetValue(indexOfMass + 2, out Modification modToReplace)) + { + massOfExistingMod = (double)modToReplace.MonoisotopicMass; + dictWithLocalizedMass.Remove(indexOfMass + 2); + } + + dictWithLocalizedMass.Add(indexOfMass + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); + + var peptideWithLocalizedMass = new OligoWithSetMods(NucleicAcid, _digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, + CleavageSpecificityForFdrCategory, dictWithLocalizedMass, NumFixedMods, FivePrimeTerminus, ThreePrimeTerminus); + + return peptideWithLocalizedMass; + } + + private Dictionary GetModsAfterDeserialization(Dictionary idToMod) + { + var mods = new Dictionary(); + int currentModStart = 0; + int currentModificationLocation = 1; + bool currentlyReadingMod = false; + int bracketCount = 0; + + for (int r = 0; r < FullSequence.Length; r++) + { + char c = FullSequence[r]; + if (c == '[') + { + currentlyReadingMod = true; + if (bracketCount == 0) + { + currentModStart = r + 1; + } + + bracketCount++; + } + else if (c == ']') + { + string modId = null; + bracketCount--; + if (bracketCount == 0) + { + try + { + //remove the beginning section (e.g. 
"Fixed", "Variable", "Uniprot") + string modString = FullSequence.Substring(currentModStart, r - currentModStart); + int splitIndex = modString.IndexOf(':'); + string modType = modString.Substring(0, splitIndex); + modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); + } + catch (Exception e) + { + throw new MzLibUtil.MzLibException( + "Error while trying to parse string into peptide: " + e.Message, e); + } + + if (!idToMod.TryGetValue(modId, out Modification mod)) + { + throw new MzLibUtil.MzLibException( + "Could not find modification while reading string: " + FullSequence); + } + + if (mod.LocationRestriction.Contains("3'-terminal.") && r == FullSequence.Length - 1) + { + currentModificationLocation = BaseSequence.Length + 2; + } + + mods.Add(currentModificationLocation, mod); + currentlyReadingMod = false; + } + } + else if (!currentlyReadingMod) + { + currentModificationLocation++; + } + //else do nothing + } + + return mods; + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs new file mode 100644 index 000000000..fb80a1a0b --- /dev/null +++ b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs @@ -0,0 +1,45 @@ +using Omics.Digestion; +using Omics.Fragmentation; + +namespace Transcriptomics.Digestion +{ + public class RnaDigestionParams : IDigestionParams + { + + // this parameterless constructor needs to exist to read the toml. 
+ public RnaDigestionParams() : this("top-down") + { + } + + public RnaDigestionParams(string rnase = "top-down", int maxMissedCleavages = 0, int minLength = 3, + int maxLength = int.MaxValue, int maxModificationIsoforms = 1024, int maxMods = 2, + FragmentationTerminus fragmentationTerminus = FragmentationTerminus.Both) + { + Rnase = RnaseDictionary.Dictionary[rnase]; + MaxMissedCleavages = maxMissedCleavages; + MinLength = minLength; + MaxLength = maxLength; + MaxMods = maxMods; + MaxModificationIsoforms = maxModificationIsoforms; + FragmentationTerminus = fragmentationTerminus; + } + + public int MaxMissedCleavages { get; set; } + public int MinLength { get; set; } + public int MaxLength { get; set; } + public int MaxModificationIsoforms { get; set; } + public int MaxMods { get; set; } + public DigestionAgent DigestionAgent => Rnase; + public Rnase Rnase { get; private set; } + public FragmentationTerminus FragmentationTerminus { get; set; } + public CleavageSpecificity SearchModeType { get; set; } = CleavageSpecificity.Full; + public IDigestionParams Clone(FragmentationTerminus? newTerminus = null) + { + return newTerminus.HasValue + ? 
new RnaDigestionParams(Rnase.Name, MaxMissedCleavages, MinLength, MaxLength, + MaxModificationIsoforms, MaxMods, newTerminus.Value) + : new RnaDigestionParams(Rnase.Name, MaxMissedCleavages, MinLength, MaxLength, + MaxModificationIsoforms, MaxMods, FragmentationTerminus); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/Rnase.cs b/mzLib/Transcriptomics/Digestion/Rnase.cs index 646bbc8d1..3670f1b3c 100644 --- a/mzLib/Transcriptomics/Digestion/Rnase.cs +++ b/mzLib/Transcriptomics/Digestion/Rnase.cs @@ -1,4 +1,5 @@ -using Omics.Digestion; +using Chemistry; +using Omics.Digestion; using Omics.Modifications; namespace Transcriptomics.Digestion @@ -13,10 +14,59 @@ public Rnase(string name, CleavageSpecificity cleaveSpecificity, List GetUnmodifiedOligos(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, int maxLength) - // private IEnumerable FullDigestion(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, int maxLength) - + public List GetUnmodifiedOligos(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, + int maxLength) + { + var oligos = new List(); + + // top down + if (CleavageSpecificity == CleavageSpecificity.None) + { + if (ValidLength(nucleicAcid.Length, minLength, maxLength)) + oligos.Add(new NucleolyticOligo(nucleicAcid, 1, nucleicAcid.Length, + 0, CleavageSpecificity.Full, nucleicAcid.FivePrimeTerminus, nucleicAcid.ThreePrimeTerminus)); + } + // full cleavage + else if (CleavageSpecificity == CleavageSpecificity.Full) + { + oligos.AddRange(FullDigestion(nucleicAcid, maxMissedCleavages, minLength, maxLength)); + } + else + { + throw new ArgumentException( + "Cleave Specificity not defined for Rna digestion, currently supports Full and None"); + } + + return oligos; + } + + private IEnumerable FullDigestion(NucleicAcid nucleicAcid, int maxMissedCleavages, + int minLength, int maxLength) + { + List oneBasedIndicesToCleaveAfter = GetDigestionSiteIndices(nucleicAcid.BaseSequence); + for (int missedCleavages = 0; 
missedCleavages <= maxMissedCleavages; missedCleavages++) + { + for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - missedCleavages - 1; i++) + { + if (ValidLength(oneBasedIndicesToCleaveAfter[i + missedCleavages + 1] - oneBasedIndicesToCleaveAfter[i], + minLength, maxLength)) + { + int oneBasedStartResidue = oneBasedIndicesToCleaveAfter[i] + 1; + int oneBasedEndResidue = oneBasedIndicesToCleaveAfter[i + missedCleavages + 1]; + + // contains original 5' terminus ? keep it : set to OH + IHasChemicalFormula fivePrimeTerminus = oneBasedStartResidue == 1 ? nucleicAcid.FivePrimeTerminus : ChemicalFormula.ParseFormula("O-3P-1"); + + // contains original 3' terminus ? keep it : set to phosphate + IHasChemicalFormula threePrimeTerminus = oneBasedEndResidue == nucleicAcid.Length ? nucleicAcid.ThreePrimeTerminus : ChemicalFormula.ParseFormula("H2O4P"); + + yield return new NucleolyticOligo(nucleicAcid, oneBasedStartResidue, oneBasedEndResidue, + missedCleavages, CleavageSpecificity.Full, fivePrimeTerminus, threePrimeTerminus); + } + } + } + } + public bool Equals(Rnase? 
other) { if (ReferenceEquals(null, other)) return false; diff --git a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs index 3d55d2ef4..4e3e95e4d 100644 --- a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs +++ b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs @@ -1,10 +1,9 @@ using Chemistry; -using Omics; using Omics.Modifications; namespace Transcriptomics { - public interface INucleicAcid : IHasChemicalFormula, IBioPolymer + public interface INucleicAcid : IHasChemicalFormula { /// /// The amino acid sequence diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs new file mode 100644 index 000000000..ff8b7774d --- /dev/null +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -0,0 +1,349 @@ +using Chemistry; +using Omics.Digestion; +using Omics.Modifications; +using Omics; +using System.Text; +using MzLibUtil; +using Transcriptomics.Digestion; + +namespace Transcriptomics +{ + /// + /// A linear polymer of Nucleic acids + /// + public abstract class NucleicAcid : INucleicAcid, IBioPolymer, IEquatable + { + #region Static Properties + + /// + /// The default chemical formula of the five prime (hydroxyl group) + /// + /// + /// This means that the five prime cap will remove the excess components of first nucleotides + /// phospho group, leaving only the hydroxyl. This formula will be used for the five prime cap, unless + /// the nucleic acid is constructed with a different chemical formula + /// + public static readonly ChemicalFormula DefaultFivePrimeTerminus = ChemicalFormula.ParseFormula("O-3P-1"); + + /// + /// The default chemical formula of the three prime terminus (hydroxyl group) + /// + /// + /// This is used to account for the mass of the additional hydroxyl group at the three end of most oligonucleotides. 
+ /// This formula will be used for the three prime cap, unless the nucleic acid is constructed with a different + /// chemical formula + /// + public static readonly ChemicalFormula DefaultThreePrimeTerminus = ChemicalFormula.ParseFormula("OH"); + + #endregion + + #region Constuctors + + /// + /// For creating an RNA programatically + /// + protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null) + { + MonoisotopicMass = 0; + _nucleicAcids = new Nucleotide[sequence.Length]; + ThreePrimeTerminus = threePrimeTerm ??= DefaultThreePrimeTerminus; + FivePrimeTerminus = fivePrimeTerm ??= DefaultFivePrimeTerminus; + _oneBasedPossibleLocalizedModifications = oneBasedPossibleLocalizedModifications ?? new Dictionary>(); + GeneNames = new List>(); + + ParseSequenceString(sequence); + } + + /// + /// For Reading in from rna database + /// + protected NucleicAcid(string sequence, string name, string identifier, string organism, string databaseFilePath, + IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null, + bool isContaminant = false, bool isDecoy = false, List>? geneNames = null, + Dictionary? additionalDatabaseFields = null) + : this(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) + { + Name = name; + DatabaseFilePath = databaseFilePath; + IsDecoy = isDecoy; + IsContaminant = isContaminant; + Organism = organism; + Accession = identifier; + AdditionalDatabaseFields = additionalDatabaseFields; + GeneNames = geneNames ?? 
new List>(); + } + + #endregion + + #region Private Properties + + /// + /// The 5-Prime chemical formula cap + /// + private IHasChemicalFormula _5PrimeTerminus; + + /// + /// The 3-Prime chemical formula cap + /// + private IHasChemicalFormula _3PrimeTerminus; + + /// + /// All of the nucleic acid residues indexed by position from 5- to 3-prime. + /// + private Nucleotide[] _nucleicAcids; + + /// + /// The nucleic acid sequence. Is ignored if 'StoreSequenceString' is false + /// + private string _sequence; + + private IDictionary> _oneBasedPossibleLocalizedModifications; + + #endregion + + #region Public Properties + + /// + /// Gets or sets the 5' terminus of this nucleic acid polymer + /// + public IHasChemicalFormula FivePrimeTerminus + { + get => _5PrimeTerminus; + set => ReplaceTerminus(ref _5PrimeTerminus, value); + } + + /// + /// Gets or sets the 3' terminus of this nucleic acid polymer + /// + public IHasChemicalFormula ThreePrimeTerminus + { + get => _3PrimeTerminus; + set => ReplaceTerminus(ref _3PrimeTerminus, value); + } + + /// + /// Gets the number of nucleic acids in this nucleic acid polymer + /// + public int Length => BaseSequence.Length; + + public string Name { get; } + public string FullName => Name; // TODO: Consider if this needs to be different from the name + public string DatabaseFilePath { get; } + public bool IsDecoy { get; } + public bool IsContaminant { get; } + public string Accession { get; } + + public IDictionary> OneBasedPossibleLocalizedModifications => _oneBasedPossibleLocalizedModifications; + public string Organism { get; } + + /// + /// The list of gene names consists of tuples, where Item1 is the type of gene name, and Item2 is the name. There may be many genes and names of a certain type produced when reading an XML protein database. + /// + public IEnumerable> GeneNames { get; } + public Dictionary? 
AdditionalDatabaseFields { get; } + + /// + /// The total monoisotopic mass of this peptide and all of its modifications + /// + public double MonoisotopicMass { get; private set; } + + /// + /// Returns a copy of the nucleic acid array, used for -base mass calculations. + /// + public Nucleotide[] NucleicAcidArray => _nucleicAcids; + + public ChemicalFormula ThisChemicalFormula => GetChemicalFormula(); + + #endregion + + #region Nucleic Acid Sequence + + /// + /// Gets the base nucleic acid sequence + /// + public string BaseSequence + { + get + { + // Generate the sequence if the stored version is null or empty + if (string.IsNullOrEmpty(_sequence)) + { + _sequence = new string(_nucleicAcids.Select(na => na.Letter).ToArray()); + } + + return _sequence; + } + } + + public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; + + #endregion + + #region Digestion + + public IEnumerable Digest(IDigestionParams digestionParameters, List allKnownFixedMods, + List variableModifications, List silacLabels = null, (SilacLabel startLabel, SilacLabel endLabel)? 
turnoverLabels = null, + bool topDownTruncationSearch = false) + { + if (digestionParameters is not RnaDigestionParams digestionParams) + throw new MzLibException( + "DigestionParameters must be of type DigestionParams for protein digestion", new ArgumentException()); + allKnownFixedMods ??= new(); + variableModifications ??= new(); + + // digest based upon base sequence + foreach (var unmodifiedOligo in digestionParams.Rnase.GetUnmodifiedOligos(this, + digestionParams.MaxMissedCleavages, digestionParams.MinLength, digestionParams.MaxLength)) + { + // add fixed and variable mods to base sequence digestion products + foreach (var modifiedOligo in unmodifiedOligo.GenerateModifiedOligos(allKnownFixedMods, digestionParams, + variableModifications)) + { + yield return modifiedOligo; + } + } + } + + public IEnumerable Digest(RnaDigestionParams digestionParameters, + List allKnownFixedMods, + List variableModifications, List silacLabels = null, + (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels = null, + bool topDownTruncationSearch = false) + { + return Digest((IDigestionParams)digestionParameters, allKnownFixedMods, variableModifications, silacLabels, turnoverLabels, topDownTruncationSearch) + .Cast(); + } + + #endregion + + #region Electrospray + + public IEnumerable GetElectrospraySeries(int minCharge, int maxCharge) + { + if (minCharge > maxCharge) + (minCharge, maxCharge) = (maxCharge, minCharge); + + for (int i = maxCharge; i > minCharge - 1; i--) + yield return this.ToMz(i); + } + + #endregion + + #region Chemical Formula + + public ChemicalFormula GetChemicalFormula() + { + var formula = new ChemicalFormula(); + + // Handle 5'-Terminus + formula.Add(FivePrimeTerminus.ThisChemicalFormula); + + // Handle 3'-Terminus + formula.Add(ThreePrimeTerminus.ThisChemicalFormula); + + // Handle Nucleic Acid Residues + for (int i = 0; i < Length; i++) + { + formula.Add(_nucleicAcids[i].ThisChemicalFormula); + } + + return formula; + } + + #endregion + + #region 
Private Methods + + private void ReplaceTerminus(ref IHasChemicalFormula? terminus, IHasChemicalFormula? value) + { + if (Equals(value, terminus)) + return; + + if (terminus != null) + MonoisotopicMass -= terminus.MonoisotopicMass; + + terminus = value; + + if (value != null) + MonoisotopicMass += value.MonoisotopicMass; + } + + /// + /// Parses a string sequence of nucleic acid characters into an array of Nucleotide objects, + /// updates the sequence string, and calculates the monoisotopic mass. + /// + /// The string sequence of nucleic acid characters to parse. + private void ParseSequenceString(string sequence) + { + if (string.IsNullOrEmpty(sequence)) + return; + + int index = 0; + double monoMass = 0; + + StringBuilder sb = null; + sb = new StringBuilder(sequence.Length); + + foreach (char letter in sequence) + { + Nucleotide residue; + if (Nucleotide.TryGetResidue(letter, out residue)) + { + _nucleicAcids[index++] = residue; + sb.Append(residue.Letter); + monoMass += residue.MonoisotopicMass; + } + else + { + switch (letter) + { + case ' ': // ignore spaces + break; + + case '*': // ignore * + break; + + default: + throw new ArgumentException(string.Format( + "Nucleic Acid Letter {0} does not exist in the Nucleic Acid Dictionary. {0} is also not a valid character", + letter)); + } + } + } + + _sequence = sb.ToString(); + MonoisotopicMass += monoMass; + Array.Resize(ref _nucleicAcids, Length); + } + + #endregion + + #region Interface Implemntations and Overrides + + public bool Equals(NucleicAcid? other) + { + if (ReferenceEquals(null, other)) return false; + if (ReferenceEquals(this, other)) return true; + return _sequence == other._sequence + && _5PrimeTerminus.Equals(other._5PrimeTerminus) + && _3PrimeTerminus.Equals(other._3PrimeTerminus); + } + + public override bool Equals(object? 
obj) + { + if (ReferenceEquals(null, obj)) return false; + if (ReferenceEquals(this, obj)) return true; + if (obj.GetType() != this.GetType()) return false; + return Equals((NucleicAcid)obj); + } + + public override int GetHashCode() + { + return HashCode.Combine(_5PrimeTerminus, _3PrimeTerminus, _sequence); + } + + #endregion + } +} diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs new file mode 100644 index 000000000..41e3a64e9 --- /dev/null +++ b/mzLib/Transcriptomics/RNA.cs @@ -0,0 +1,47 @@ +using Chemistry; +using Omics.Modifications; + +namespace Transcriptomics +{ + public class RNA : NucleicAcid + { + /// + /// For constructing RNA from a string + /// + /// + /// + /// + /// + public RNA(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null) + : base(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) + { + } + + /// + /// For use with RNA loaded from a database + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + public RNA(string sequence, string name, string identifier, string organism, string databaseFilePath, + IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null, + IDictionary>? oneBasedPossibleModifications = null, + bool isContaminant = false, bool isDecoy = false, List> geneNames = null, + Dictionary? 
databaseAdditionalFields = null) + : base(sequence, name, identifier, organism, databaseFilePath, fivePrimeTerminus, threePrimeTerminus, + oneBasedPossibleModifications, isContaminant, isDecoy, geneNames, databaseAdditionalFields) + { + + } + } +} diff --git a/mzLib/Transcriptomics/Transcriptomics.csproj b/mzLib/Transcriptomics/Transcriptomics.csproj index a670300ca..f9962577c 100644 --- a/mzLib/Transcriptomics/Transcriptomics.csproj +++ b/mzLib/Transcriptomics/Transcriptomics.csproj @@ -1,7 +1,7 @@ - net6.0 + net8.0 x64 enable enable @@ -12,6 +12,10 @@ true + + + + diff --git a/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs b/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs index b70e3dc23..51978b2db 100644 --- a/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs +++ b/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs @@ -19,5 +19,17 @@ public FastaHeaderFieldRegex(string fieldName, string regularExpression, int mat public int Match { get; } public int Group { get; } + + public string ApplyRegex(string input) + { + string? 
result = null; + var matches = Regex.Matches(input); + if (matches.Count > Match && matches[Match].Groups.Count > Group) + { + result = matches[Match].Groups[Group].Value; + } + + return result!; + } } } \ No newline at end of file diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 8544c2233..b5a680a5e 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -402,7 +402,7 @@ public static IEnumerable MergeProteins(IEnumerable mergeThese } } - private static string ApplyRegex(FastaHeaderFieldRegex regex, string line) + internal static string ApplyRegex(FastaHeaderFieldRegex regex, string line) { string result = null; if (regex != null) @@ -416,7 +416,7 @@ private static string ApplyRegex(FastaHeaderFieldRegex regex, string line) return result; } - private static Dictionary> GetModificationDict(IEnumerable mods) + internal static Dictionary> GetModificationDict(IEnumerable mods) { var mod_dict = new Dictionary>(); @@ -436,7 +436,7 @@ private static Dictionary> GetModificationDict(IEnum return mod_dict; } - private static Dictionary GetModificationDictWithMotifs(IEnumerable mods) + internal static Dictionary GetModificationDictWithMotifs(IEnumerable mods) { var mod_dict = new Dictionary(); diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 155945558..dadba9e11 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -5,12 +5,287 @@ using System.IO; using System.Linq; using System.Xml; +using Easy.Common.Extensions; +using Omics; using Omics.Modifications; +using Transcriptomics; namespace UsefulProteomicsDatabases { + + /// + /// Provides methods for writing protein and nucleic acid databases to XML and FASTA formats. 
+ /// Did not rename to DbWriter to ensure compatibility with the original UsefulProteomicsDatabases namespace. + /// public class ProteinDbWriter { + /// + /// Writes an XML database for a list of RNA sequences, including additional modifications. + /// + /// A dictionary of additional modifications to add to proteins. + /// A list of RNA sequences to be written to the database. + /// The name of the output XML file. + /// A dictionary of new modification residue entries. + public static Dictionary WriteXmlDatabase( + Dictionary>> additionalModsToAddToProteins, + List bioPolymerList, string outputFileName) => WriteNucleicAcidXmlDatabase(additionalModsToAddToProteins, bioPolymerList.Cast().ToList(), outputFileName); + + /// + /// Writes an XML database for a list of nucleic acid sequences, including additional modifications. + /// + /// A dictionary of additional modifications to add to proteins. + /// A list of nucleic acid sequences to be written to the database. + /// The name of the output XML file. + /// A dictionary of new modification residue entries. + /// + /// Several chunks of code are commented out. These are blocks that are intended to be implmented in the future, but + /// are not necessary for the bare bones implementation of Transcriptomics + /// + private static Dictionary WriteNucleicAcidXmlDatabase( + Dictionary>> additionalModsToAddToProteins, + List nucleicAcidList, string outputFileName) + { + additionalModsToAddToProteins = additionalModsToAddToProteins ?? 
new Dictionary>>(); + var xmlWriterSettings = new XmlWriterSettings + { + Indent = true, + IndentChars = " " + }; + + Dictionary newModResEntries = new Dictionary(); + using (XmlWriter writer = XmlWriter.Create(outputFileName, xmlWriterSettings)) + { + writer.WriteStartDocument(); + writer.WriteStartElement("mzLibProteinDb"); + + // accessions are collected once so the filter on additionalModsToAddToProteins is a set lookup rather than a rescan of the whole list per entry + var knownAccessions = nucleicAcidList.Select(nu => nu.Accession).ToHashSet(); + + // get modifications from nucleic acid list and concatenate the modifications discovered in GPTMDictionary + var allRelevantModifications = + new HashSet(nucleicAcidList + .SelectMany(p => p.OneBasedPossibleLocalizedModifications.SelectMany(m => m.Value)) + .Concat(additionalModsToAddToProteins + .Where(n => knownAccessions.Contains(n.Key)) + .SelectMany(kv => kv.Value.Select(v => v.Item2)))); + + // modification definitions are written first, ordered by IdWithMotif + foreach (Modification mod in allRelevantModifications.OrderBy(m => m.IdWithMotif)) + { + writer.WriteStartElement("modification"); + writer.WriteString(mod.ToString() + Environment.NewLine + "//"); + writer.WriteEndElement(); + } + + foreach (var nucleicAcid in nucleicAcidList) + { + writer.WriteStartElement("entry"); + writer.WriteStartElement("accession"); + writer.WriteString(nucleicAcid.Accession); + writer.WriteEndElement(); + + if (nucleicAcid.Name.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("name"); + writer.WriteString(nucleicAcid.Name); + writer.WriteEndElement(); + } + + if (nucleicAcid.FullName.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("protein"); + writer.WriteStartElement("recommendedName"); + writer.WriteStartElement("fullName"); + writer.WriteString(nucleicAcid.FullName); + writer.WriteEndElement(); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + + writer.WriteStartElement("gene"); + foreach (var geneName in nucleicAcid.GeneNames) + { + 
writer.WriteStartElement("name"); + writer.WriteAttributeString("type", geneName.Item1); + writer.WriteString(geneName.Item2); + writer.WriteEndElement(); + } + writer.WriteEndElement(); + + if (nucleicAcid.Organism.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("organism"); + writer.WriteStartElement("name"); + writer.WriteAttributeString("type", "scientific"); + writer.WriteString(nucleicAcid.Organism); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + + //foreach (var dbRef in nucleicAcid) + //{ + // writer.WriteStartElement("dbReference"); + // writer.WriteAttributeString("type", dbRef.Type); + // writer.WriteAttributeString("id", dbRef.Id); + // foreach (Tuple property in dbRef.Properties) + // { + // writer.WriteStartElement("property"); + // writer.WriteAttributeString("type", property.Item1); + // writer.WriteAttributeString("value", property.Item2); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); + //} + + ////for now we are not going to write top-down truncations generated for top-down truncation search. 
+ ////some day we could write those if observed + ////the truncation designation is contained in the "type" field of ProteolysisProduct + //List proteolysisProducts = nucleicAcid.ProteolysisProducts.Where(p => !p.Type.Contains("truncation")).ToList(); + //foreach (var proteolysisProduct in proteolysisProducts) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", proteolysisProduct.Type.Split('(')[0]); + // writer.WriteStartElement("location"); + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", proteolysisProduct.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", proteolysisProduct.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + //} + + foreach (var hm in GetModsForThisBioPolymer(nucleicAcid, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + { + foreach (var modId in hm.Value) + { + writer.WriteStartElement("feature"); + writer.WriteAttributeString("type", "modified residue"); + writer.WriteAttributeString("description", modId); + writer.WriteStartElement("location"); + writer.WriteStartElement("position"); + writer.WriteAttributeString("position", hm.Key.ToString(CultureInfo.InvariantCulture)); + writer.WriteEndElement(); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + } + + //foreach (var hm in nucleicAcid.SequenceVariations) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "sequence variant"); + // writer.WriteAttributeString("description", hm.Description.ToString()); + // writer.WriteStartElement("original"); + // writer.WriteString(hm.OriginalSequence); + // writer.WriteEndElement(); // original + // writer.WriteStartElement("variation"); + // writer.WriteString(hm.VariantSequence); + // writer.WriteEndElement(); // variation + // 
writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // foreach (var hmm in GetModsForThisProtein(nucleicAcid, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + // { + // foreach (var modId in hmm.Value) + // { + // writer.WriteStartElement("subfeature"); + // writer.WriteAttributeString("type", "modified residue"); + // writer.WriteAttributeString("description", modId); + // writer.WriteStartElement("location"); + // writer.WriteStartElement("subposition"); + // writer.WriteAttributeString("subposition", hmm.Key.ToString(CultureInfo.InvariantCulture)); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // } + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + //foreach (var hm in nucleicAcid.DisulfideBonds) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "disulfide bond"); + // writer.WriteAttributeString("description", hm.Description); + // writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // 
writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + //foreach (var hm in nucleicAcid.SpliceSites) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "splice site"); + // writer.WriteAttributeString("description", hm.Description); + // writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + writer.WriteStartElement("sequence"); + writer.WriteAttributeString("length", nucleicAcid.Length.ToString(CultureInfo.InvariantCulture)); + writer.WriteString(nucleicAcid.BaseSequence); + writer.WriteEndElement(); // sequence + writer.WriteEndElement(); // entry + } + + writer.WriteEndElement(); // mzLibProteinDb + writer.WriteEndDocument(); + } + return newModResEntries; + } + /// /// Writes a protein database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list. 
/// @@ -138,7 +413,7 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var hm in GetModsForThisBioPolymer(protein, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) { foreach (var modId in hm.Value) { @@ -181,7 +456,7 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var hmm in GetModsForThisBioPolymer(protein, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) { foreach (var modId in hmm.Value) { @@ -276,7 +551,7 @@ public static void WriteFastaDatabase(List proteinList, string outputFi } } - private static Dictionary> GetModsForThisProtein(Protein protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) + private static Dictionary> GetModsForThisBioPolymer(IBioPolymer protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) { var modsToWriteForThisSpecificProtein = new Dictionary>(); @@ -292,7 +567,8 @@ private static Dictionary> GetModsForThisProtein(Protein pr } } - string accession = seqvar == null ? protein.Accession : VariantApplication.GetAccession(protein, new[] { seqvar }); + // This cast to protein is okay as no sequence variation is programmed to RNA as of 9/24/24 + string accession = seqvar == null ? 
protein.Accession : VariantApplication.GetAccession(protein as Protein, new[] { seqvar }); if (additionalModsToAddToProteins.ContainsKey(accession)) { foreach (var ye in additionalModsToAddToProteins[accession]) diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index a93c896e7..becfa2cfa 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -5,6 +5,8 @@ using System.Text.RegularExpressions; using System.Xml; using Omics.Modifications; +using Transcriptomics; +using UsefulProteomicsDatabases.Transcriptomics; namespace UsefulProteomicsDatabases { @@ -182,6 +184,38 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExcl return protein; } + internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExclude, + Dictionary unknownModifications, + bool isContaminant, string rnaDbLocation) + { + RNA result = null; + if (xml.Name == "feature") + { + ParseFeatureEndElement(xml, modTypesToExclude, unknownModifications); + } + if (xml.Name == "subfeature") + { + ParseSubFeatureEndElement(xml, modTypesToExclude, unknownModifications); + } + else if (xml.Name == "dbReference") + { + ParseDatabaseReferenceEndElement(xml); + } + else if (xml.Name == "gene") + { + ReadingGene = false; + } + else if (xml.Name == "organism") + { + ReadingOrganism = false; + } + else if (xml.Name == "entry") + { + result = ParseRnaEntryEndElement(xml, isContaminant, rnaDbLocation, modTypesToExclude, unknownModifications); + } + return result; + } + /// /// Finish parsing an entry /// @@ -202,6 +236,24 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr return result; } + internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation, + IEnumerable modTypesToExclude, Dictionary unknownModifications) + { + RNA result = null; + if (Accession != null && Sequence != null) + { + // 
sanitize the sequence to replace unexpected characters with X (unknown amino acid) + // sometimes strange characters get added by RNA sequencing software, etc. + Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X'); + + ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods); + result = new RNA(Sequence, Name, Accession, Organism, rnaDbLocation, null, + null, OneBasedModifications, isContaminant, false, GeneNames, null); + } + Clear(); + return result; + } + /// /// Finish parsing a subfeature element /// @@ -304,7 +356,8 @@ private static void ParseAnnotatedMods(Dictionary> desti string annotatedId = annotatedMod.Item2; int annotatedModLocation = annotatedMod.Item1; - if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)) + if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod) + || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod)) { // if the list of known mods contains this IdWithMotif if (!modTypesToExclude.Contains(foundMod.ModificationType)) @@ -322,7 +375,8 @@ private static void ParseAnnotatedMods(Dictionary> desti } // no known mod - try looking it up in the dictionary of mods without motif appended - else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods)) + else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods) + || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods)) { foreach (Modification mod in mods) { @@ -352,19 +406,6 @@ private static void ParseAnnotatedMods(Dictionary> desti } } - private static ModificationMotif GetMotif(string proteinSequence, int position) - { - string aminoAcid = proteinSequence.Substring(position - 1, 1); - if (ModificationMotif.TryGetMotif(aminoAcid, out ModificationMotif motif)) - { - return motif; - } - else - { - return null; - } - } - /// /// Finish parsing a database reference element /// diff --git 
a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs new file mode 100644 index 000000000..2e80c090c --- /dev/null +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs @@ -0,0 +1,261 @@ +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.IO.Compression; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using System.Xml; +using Chemistry; +using Transcriptomics; + +namespace UsefulProteomicsDatabases.Transcriptomics +{ + public enum RnaFastaHeaderType + { + Modomics, + Unknown, + } + + public static class RnaDbLoader + { + + #region Header Detection and Property Regexes + + // Classifies a FASTA header line: a line starting with ">id" is Modomics, anything else is Unknown. + public static RnaFastaHeaderType DetectRnaFastaHeaderType(string line) + { + // the prefix is a fixed file-format token, so it is compared ordinally rather than with the current culture + if (line.StartsWith(">id", StringComparison.Ordinal)) + return RnaFastaHeaderType.Modomics; + + return RnaFastaHeaderType.Unknown; + } + + /// + /// Dictionary that extract accession number, species, name, and additional dataField of modomics + /// + public static readonly Dictionary ModomicsFieldRegexes = + new Dictionary() + { + { "Id", new FastaHeaderFieldRegex("Id", @"id:(?.+?)\|", 0, 1) }, + { "Name", new FastaHeaderFieldRegex("Name", @"Name:(?.+?)\|", 0, 1) }, + { "SOterm", new FastaHeaderFieldRegex("SOterm", @"SOterm:(?.+?)\|", 0, 1) }, + { "Type", new FastaHeaderFieldRegex("Type", @"Type:(?.+?)\|", 0, 1) }, + { "Subtype", new FastaHeaderFieldRegex("Subtype", @"Subtype:(?.+?)\|", 0, 1) }, + { "Feature", new FastaHeaderFieldRegex("Feature", @"Feature:(?.+?)\|", 0, 1) }, + { "Organism", new FastaHeaderFieldRegex("Organism", @"Species:(?.+?)$", 0, 1) }, + { "Cellular Localization", new FastaHeaderFieldRegex("CellularLocalization", @"Cellular_Localization:(?.+?)\|", 0, 1) }, + }; + + #endregion + + /// + /// Loads an RNA file from the specified location, optionally generating decoys and adding error tracking + /// + /// The file path 
to the RNA FASTA database + /// Flag indicating whether to generate targets or not + /// The type of decoy generation to apply + /// Indicates if the RNA sequence is a contaminant + /// Outputs any errors encountered during the process + /// An optional 5' prime chemical modification term + /// An optional 3' prime chemical modification term + /// A list of RNA sequences loaded from the FASTA database + /// Thrown if the FASTA header format is unknown or other issues occur during loading. + + public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, DecoyType decoyType, + bool isContaminant, out List errors, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null) + { + RnaFastaHeaderType? headerType = null; + Regex substituteWhitespace = new Regex(@"\s+"); + errors = new List(); + List targets = new List(); + string identifierHeader = null; + + string name = null; + string organism = null; + string identifier = null; + + string newDbLocation = rnaDbLocation; + + try + { + //we had trouble decompressing and streaming on the fly so we decompress completely first, then stream the file, then delete the decompressed file + if (rnaDbLocation.EndsWith(".gz")) + { + newDbLocation = Path.Combine(Path.GetDirectoryName(rnaDbLocation), "temp.fasta"); + using var stream = new FileStream(rnaDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read); + using FileStream outputFileStream = File.Create(newDbLocation); + using var decompressor = new GZipStream(stream, CompressionMode.Decompress); + decompressor.CopyTo(outputFileStream); + } + + using (var fastaFileStream = new FileStream(newDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + StringBuilder sb = null; + // the reader is disposed together with the stream it wraps + using StreamReader fasta = new StreamReader(fastaFileStream); + Dictionary regexResults = new(); + Dictionary regexes = null; + + while (true) + { + string line = fasta.ReadLine(); + if (line == null) { break; } + + if (line.StartsWith(">")) + { + if 
(headerType is null) + { + // the header style is detected once, from the first header line, and reused for the rest of the file + headerType = DetectRnaFastaHeaderType(line); + + switch (headerType) + { + case RnaFastaHeaderType.Modomics: + regexes = ModomicsFieldRegexes; + identifierHeader = "SOterm"; + break; + default: + throw new MzLibUtil.MzLibException("Unknown fasta header format: " + line); + } + } + + + // name, organism and identifier are pulled out; whatever remains becomes the additional database fields + regexResults = ParseRegexFields(line, regexes); + name = regexResults["Name"]; + regexResults.Remove("Name"); + organism = regexResults["Organism"]; + regexResults.Remove("Organism"); + identifier = regexResults[identifierHeader]; + regexResults.Remove(identifierHeader); + + sb = new StringBuilder(); + } + else + { + sb?.Append(line.Trim()); + } + + // an entry is complete when the next character starts a new header or the input is exhausted + if ((fasta.Peek() == '>' || fasta.Peek() == -1) /*&& accession != null*/ && sb != null) + { + string sequence = substituteWhitespace.Replace(sb.ToString(), ""); + Dictionary additonalDatabaseFields = + regexResults.ToDictionary(x => x.Key, x => x.Value); + + // Do we need to sanitize the sequence? + + RNA rna = new RNA(sequence, name, identifier, organism, rnaDbLocation, + fivePrimeTerm, threePrimeTerm, null, + isContaminant, false, null, additonalDatabaseFields); + if (rna.Length == 0) + errors.Add("Line " + line + ", Rna length of 0: " + rna.Name + " was skipped from database: " + rnaDbLocation); + else + targets.Add(rna); + + name = null; + organism = null; + identifier = null; + regexResults.Clear(); + } + + // no input left + if (fasta.Peek() == -1) + { + break; + } + } + } + } + finally + { + // delete the decompressed copy even when parsing throws, so a failed load does not leave temp.fasta behind + if (newDbLocation != rnaDbLocation) + File.Delete(newDbLocation); + } + + if (!targets.Any()) + errors.Add("No targets were loaded from database: " + rnaDbLocation); + + List decoys = RnaDecoyGenerator.GenerateDecoys(targets, decoyType); + return generateTargets ? 
targets.Concat(decoys).ToList() : decoys; + } + + + + private static Dictionary ParseRegexFields(string line, + Dictionary regexes) + { + Dictionary fields = new Dictionary(); + + foreach (var regex in regexes) + { + string match = regex.Value.ApplyRegex(line); + fields.Add(regex.Key, match); + } + + return fields; + } + + public static Dictionary> IdToPossibleMods = new Dictionary>(); + public static Dictionary IdWithMotifToMod = new Dictionary(); + + // Reads RNA entries from an mzLib-style XML database (optionally gzip-compressed) and returns targets and/or decoys. + // NOTE(review): fivePrimeTerm and threePrimeTerm are accepted but never read in this method - confirm whether they should reach the RNA entries. + public static List LoadRnaXML(string rnaDbLocation, bool generateTargets, DecoyType decoyType, + bool isContaminant, IEnumerable allKnownModifications, + IEnumerable modTypesToExclude, out Dictionary unknownModifications, + int maxThreads = 1, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null) + { + var prespecified = ProteinDbLoader.GetPtmListFromProteinXml(rnaDbLocation); + allKnownModifications = allKnownModifications ?? new List(); + modTypesToExclude = modTypesToExclude ?? new List(); + + if (prespecified.Count > 0 || allKnownModifications.Any()) + { + // the combined set is built once and both static lookup tables are filled from it + var knownModifications = prespecified.Concat(allKnownModifications).ToHashSet(); + IdToPossibleMods = ProteinDbLoader.GetModificationDict(knownModifications); + IdWithMotifToMod = ProteinDbLoader.GetModificationDictWithMotifs(knownModifications); + } + List targets = new List(); + unknownModifications = new Dictionary(); + + string newProteinDbLocation = rnaDbLocation; + + try + { + //we had trouble decompressing and streaming on the fly so we decompress completely first, then stream the file, then delete the decompressed file + if (rnaDbLocation.EndsWith(".gz")) + { + newProteinDbLocation = Path.Combine(Path.GetDirectoryName(rnaDbLocation), "temp.xml"); + using var stream = new FileStream(rnaDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read); + using FileStream outputFileStream = File.Create(newProteinDbLocation); + using var 
decompressor = new GZipStream(stream, CompressionMode.Decompress); + decompressor.CopyTo(outputFileStream); + } + + using (var uniprotXmlFileStream = new FileStream(newProteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + ProteinXmlEntry block = new ProteinXmlEntry(); + + using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream)) + { + while (xml.Read()) + { + if (xml.NodeType == XmlNodeType.Element) + { + block.ParseElement(xml.Name, xml); + } + if (xml.NodeType == XmlNodeType.EndElement || xml.IsEmptyElement) + { + RNA newProtein = block.ParseRnaEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, rnaDbLocation); + if (newProtein != null) + { + targets.Add(newProtein); + } + } + } + } + } + } + finally + { + // delete the decompressed copy even when parsing throws, so a failed load does not leave temp.xml behind + if (newProteinDbLocation != rnaDbLocation) + { + File.Delete(newProteinDbLocation); + } + } + + List decoys = RnaDecoyGenerator.GenerateDecoys(targets, decoyType, maxThreads); + IEnumerable proteinsToExpand = generateTargets ? targets.Concat(decoys) : decoys; + return proteinsToExpand.ToList(); + } + } +} diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs new file mode 100644 index 000000000..b9cc20e1d --- /dev/null +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs @@ -0,0 +1,89 @@ +using Proteomics; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using MassSpectrometry; +using Omics.Modifications; +using Transcriptomics; + +namespace UsefulProteomicsDatabases.Transcriptomics +{ + /// + /// Provides methods for generating decoy nucleic acids from any implementor of . + /// + /// + /// This class supports various types of decoy generation, including reversing, sliding, and shuffling sequences. 
+ /// It allows for the creation of decoy sequences while preserving certain characteristics such as modification sites and termini. + /// The GenerateDecoys method serves as the main entry point, delegating to specific decoy generation methods based on the specified . + /// TODO: Implement Shuffle and Slide Decoys + /// TODO: Consider passing digestion motif as optional parameter to leave digestion sites intact. Currently leaving the 3' intact as it is the predominant cleavage motif. + /// TODO: Consider palindromic sequences and the result they have on fragment ions (d/z are identical, c/y are identical). This will be particularly important for slided decoys + /// + public static class RnaDecoyGenerator + { + // Single entry point: hands the targets to the generator that matches decoyType; Random and any unrecognised value are rejected. + public static List GenerateDecoys(List nucleicAcids, DecoyType decoyType, int maxThreads = -1) where T : INucleicAcid + { + return decoyType switch + { + DecoyType.None => new List(), + DecoyType.Reverse => GenerateReverseDecoys(nucleicAcids, maxThreads), + DecoyType.Slide => GenerateSlidedDecoys(nucleicAcids, maxThreads), + DecoyType.Shuffle => GenerateShuffledDeocys(nucleicAcids, maxThreads), + _ => throw new ArgumentOutOfRangeException(nameof(decoyType), decoyType, null), + }; + } + + /// + /// Generated decoys in which the sequence is reversed, + /// leaving modification on their nucleic acid of origin, + /// and 3' termini intact as it is the most likely cleavage site. 
+ /// + /// + /// + /// + private static List GenerateReverseDecoys(List nucleicAcids, int maxThreads = -1) where T : INucleicAcid + { + // one slot per target: every worker writes only its own index, so no lock is needed and + // the decoys come back in target order on every run instead of in thread-completion order + T[] decoyNucleicAcids = new T[nucleicAcids.Count]; + Parallel.For(0, nucleicAcids.Count, new ParallelOptions() { MaxDegreeOfParallelism = maxThreads }, i => + { + T nucleicAcid = nucleicAcids[i]; + string baseSequence = nucleicAcid.BaseSequence; + + // reverse sequence, keeping the last residue in place; an empty sequence is passed through + // unchanged because the range [..^1] throws on a zero-length string + var reverseSequence = baseSequence.Length == 0 + ? baseSequence + : new string(baseSequence[..^1].Reverse().Append(baseSequence[^1]).ToArray()); + + // reverse modifications + var reverseModifications = new Dictionary>(); + foreach (var kvp in nucleicAcid.OneBasedPossibleLocalizedModifications) + { + // a key equal to the sequence length stays where it is; every other key p maps to length - p + var reverseKey = kvp.Key == reverseSequence.Length ? kvp.Key : reverseSequence.Length - kvp.Key; + reverseModifications.Add(reverseKey, kvp.Value); + } + + decoyNucleicAcids[i] = nucleicAcid.CreateNew(reverseSequence, reverseModifications, true); + }); + return decoyNucleicAcids.ToList(); + } + + private static List GenerateSlidedDecoys(List nucleicAcids, int maxThreads = -1) where T : INucleicAcid + { + throw new NotImplementedException(); + } + + private static List GenerateShuffledDeocys(List nucleicAcids, int maxThreads = -1) where T : INucleicAcid + { + throw new NotImplementedException(); + } + + } +} diff --git a/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj b/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj index 590324de2..f057fc396 100644 --- a/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj +++ b/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj @@ -1,7 +1,7 @@ ﻿ - net6.0 + net8.0 x64 @@ -11,13 +11,15 @@ - + + + diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 5d5400e95..3aa393afe 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -12,20 +12,24 @@ Library for mass spectrometry projects. 
Chemistry Spectrometry - - - + + + + + - - - + + + + + @@ -37,48 +41,48 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mzLib/mzLib.sln.DotSettings b/mzLib/mzLib.sln.DotSettings index 78477fa52..6c67babc8 100644 --- a/mzLib/mzLib.sln.DotSettings +++ b/mzLib/mzLib.sln.DotSettings @@ -1,11 +1,17 @@ - + + True True True True True True + True + True + True True + True True True + True True True \ No newline at end of file diff --git a/mzLib/mzPlot/mzPlot.csproj b/mzLib/mzPlot/mzPlot.csproj index b8de36731..ec7550390 100644 --- a/mzLib/mzPlot/mzPlot.csproj +++ b/mzLib/mzPlot/mzPlot.csproj @@ -1,11 +1,12 @@  - net6.0-windows + net8.0-windows x64 + diff --git a/mzLib/pepXML/pepXML.csproj b/mzLib/pepXML/pepXML.csproj index e77e73219..aff6dfbfa 100644 --- a/mzLib/pepXML/pepXML.csproj +++ b/mzLib/pepXML/pepXML.csproj @@ -1,7 +1,7 @@ - net6.0 + net8.0 x64 @@ -9,5 +9,9 @@ full true + + + +