diff --git a/.github/workflows/dotnet.yml b/.github/workflows/dotnet.yml index f89904dc3..dca2c6509 100644 --- a/.github/workflows/dotnet.yml +++ b/.github/workflows/dotnet.yml @@ -38,3 +38,65 @@ jobs: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} verbose: true files: mzLib/Test*/TestResults/*/coverage.cobertura.xml + integration: + runs-on: windows-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@v2 + - name: Set up .NET + uses: actions/setup-dotnet@v1 + with: + dotnet-version: 8.0.x + - name: Restore dependencies + run: cd mzLib && dotnet restore + - name: Build + run: cd mzLib && dotnet build --no-restore --configuration Release + - name: Change mzLib version, pack, add source + run: | + cd mzLib; + (Get-Content mzLib.nuspec) -replace "\(.*)\", "9.9.9" | Set-Content mzLib.nuspec; + $mzlibMatch = Select-String -Path mzLib.nuspec -Pattern "(?<=\)(.*)(?=\ RoundedDouble(myNumber as double?, places); public static double? RoundedDouble(this double? myNumber, int places = 9) { if (myNumber != null) diff --git a/mzLib/Development/Deconvolution/StandardDeconvolutionTest.cs b/mzLib/Development/DeconvolutionDevelopment/StandardDeconvolutionTest.cs similarity index 56% rename from mzLib/Development/Deconvolution/StandardDeconvolutionTest.cs rename to mzLib/Development/DeconvolutionDevelopment/StandardDeconvolutionTest.cs index ea02b6dd4..002e0f9b5 100644 --- a/mzLib/Development/Deconvolution/StandardDeconvolutionTest.cs +++ b/mzLib/Development/DeconvolutionDevelopment/StandardDeconvolutionTest.cs @@ -38,117 +38,152 @@ static StandardDeconvolutionTest() Loaders.LoadElements(); // define paths to spectra - var ubiquitinPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Deconvolution", "TestData", + var ubiquitinPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DeconvolutionDevelopment", "TestData", "Averaged_221110_UbiqOnly.mzML"); - var hghPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Deconvolution", "TestData", + var 
hghPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DeconvolutionDevelopment", "TestData", "Averaged_221110_HGHOnly.mzML"); - var cytoPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Deconvolution", "TestData", + var cytoPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DeconvolutionDevelopment", "TestData", "Averaged_221110_CytoOnly.mzML"); // set up deconvoluters to be utilized by test cases - DeconvolutionParameters classicTopDownDeconvolutionParams = new ClassicDeconvolutionParameters(1, 60, 4, 3); - DeconvolutionParameters classicBottomUpDeconvolutionParams = new ClassicDeconvolutionParameters(1, 12, 4, 3); + List topDownDeconvolutionParametersToTest = + [ + new ClassicDeconvolutionParameters(1, 60, 4, 3), + new IsoDecDeconvolutionParameters() + ]; - // Add Individual peak test cases - List singlePeakDeconvolutionTestCases = new() + List bottomUpDeconvolutionParametersToTest = + [ + new ClassicDeconvolutionParameters(1, 12, 4, 3), + new IsoDecDeconvolutionParameters() + ]; + + + + // Add Individual peak test cases for top down + List singlePeakDeconvolutionTestCases = new(); + foreach (var deconParams in topDownDeconvolutionParametersToTest) { // uniquitin, direct injection - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10038.4, 8, 1254.8, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10039.41, 9, 1115.49, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10041.4, 10, 1004.14, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10041.46, 11, 912.86, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, 
Averaged", - ubiquitinPath, 1, 10043.4, 12, 836.95, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10043.41, 13, 772.57, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10044.44, 14, 717.46, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10045.5, 15, 669.70, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", - ubiquitinPath, 1, 10045.44, 16, 627.84, 20), + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10038.4, 8, 1254.8, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10039.41, 9, 1115.49, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10041.4, 10, 1004.14, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10041.46, 11, 912.86, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10043.4, 12, 836.95, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10043.41, 13, 772.57, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10044.44, 14, 717.46, 20)); + 
singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10045.5, 15, 669.70, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection PolyUbiquitin, Averaged", + ubiquitinPath, 1, 10045.44, 16, 627.84, 20)); // hgh, direct injection - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22139.41, 11, 2012.29, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22139.41, 11, 2012.29, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22136.28, 12, 1844.69, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22136.28, 12, 1844.69, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22137.31, 13, 1702.87, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22137.31, 13, 1702.87, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22139.32, 14, 1581.38, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22139.32, 14, 1581.38, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22139.25, 15, 1475.95, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22139.25, 15, 1475.95, 20)); + singlePeakDeconvolutionTestCases.Add(new 
SinglePeakDeconvolutionTestCase(deconParams, "Direct Injectio Human Growth Hormone, Averaged", - hghPath, 1, 22140.32, 16, 1383.77, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22140.32, 16, 1383.77, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22141.31, 17, 1302.43, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22141.31, 17, 1302.43, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22142.34, 18, 1230.13, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, + hghPath, 1, 22142.34, 18, 1230.13, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", - hghPath, 1, 22143.36, 19, 1165.44, 20), + hghPath, 1, 22143.36, 19, 1165.44, 20)); // cytochrome c, direct injection - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12367.44, 9, 1374.16, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12367.4, 10, 1236.74, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12368.4, 11, 1124.40, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12370.44, 12, 1030.87, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12371.45, 13, 951.65, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome 
C, Averaged", - cytoPath, 1, 12373.48, 14, 883.82, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12373.5, 15, 824.90, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12374.56, 16, 773.41, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12374.47, 17, 727.91, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12376.44, 18, 687.58, 20), - new SinglePeakDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", - cytoPath, 1, 12360.6, 20, 619.03, 20) - }; + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12367.44, 9, 1374.16, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12367.4, 10, 1236.74, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12368.4, 11, 1124.40, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12370.44, 12, 1030.87, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12371.45, 13, 951.65, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12373.48, 14, 883.82, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + 
"Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12373.5, 15, 824.90, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12374.56, 16, 773.41, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12374.47, 17, 727.91, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12376.44, 18, 687.58, 20)); + singlePeakDeconvolutionTestCases.Add(new SinglePeakDeconvolutionTestCase(deconParams, + "Direct Injection Cytochrome C, Averaged", + cytoPath, 1, 12360.6, 20, 619.03, 20)); + } _singlePeakTestCases = singlePeakDeconvolutionTestCases; - // Add whole spectrum test cases - List wholeSpectrumDeconvolutionTestCases = new() + // Add whole spectrum test cases for top down + List wholeSpectrumDeconvolutionTestCases = new(); + foreach (var deconParams in topDownDeconvolutionParametersToTest) { - new WholeSpectrumDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection PolyUbiquitin, Averaged", + wholeSpectrumDeconvolutionTestCases.Add(new WholeSpectrumDeconvolutionTestCase(deconParams, "Direct Injection PolyUbiquitin, Averaged", ubiquitinPath, 1, 20, new[] { 10038.4, 10039.41, 10041.4, 10041.46, 10043.4, 10043.41, 10044.44, 10045.5, 10045.44, }, new[] { 8, 9, 10, 11, 12, 13, 14, 15, 16 }, - new[] { 1254.8, 1115.49, 1004.14, 912.86, 836.95, 772.57, 717.46, 669.70, 627.84 }), + new[] { 1254.8, 1115.49, 1004.14, 912.86, 836.95, 772.57, 717.46, 669.70, 627.84 })); - new WholeSpectrumDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Human Growth Hormone, Averaged", + wholeSpectrumDeconvolutionTestCases.Add(new WholeSpectrumDeconvolutionTestCase(deconParams, "Direct Injection Human Growth Hormone, Averaged", hghPath, 1, 20, new 
[]{22139.41, 22136.28, 22137.31, 22139.32, 22139.25, 22140.32, 22141.31, 22142.34, 22143.36}, new []{11, 12, 13, 14, 15, 16, 17, 18, 19}, - new []{2012.29, 1844.69, 1702.87, 1581.38, 1475.95, 1383.77, 1302.43, 1230.13, 1165.44}), + new []{2012.29, 1844.69, 1702.87, 1581.38, 1475.95, 1383.77, 1302.43, 1230.13, 1165.44})); - new WholeSpectrumDeconvolutionTestCase(classicTopDownDeconvolutionParams, "Direct Injection Cytochrome C, Averaged", + wholeSpectrumDeconvolutionTestCases.Add(new WholeSpectrumDeconvolutionTestCase(deconParams, "Direct Injection Cytochrome C, Averaged", cytoPath, 1, 20, new []{12367.44, 12367.4, 12368.4, 12370.44, 12371.45, 12373.48, 12373.5, 12374.56, 12374.47, 12376.44, 12360.6}, new []{9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20}, - new []{1374.16, 1236.74, 1124.40, 1030.87, 951.65, 883.82, 824.90, 773.41, 727.91, 687.58, 619.03}), + new []{1374.16, 1236.74, 1124.40, 1030.87, 951.65, 883.82, 824.90, 773.41, 727.91, 687.58, 619.03})); }; _wholeSpectrumDeconvolutionTestCases = wholeSpectrumDeconvolutionTestCases; + + // TODO: Add cases for bottom up deconvolution } #endregion diff --git a/mzLib/Development/Deconvolution/TestCases/SinglePeakDeconvolutionTestCase.cs b/mzLib/Development/DeconvolutionDevelopment/TestCases/SinglePeakDeconvolutionTestCase.cs similarity index 98% rename from mzLib/Development/Deconvolution/TestCases/SinglePeakDeconvolutionTestCase.cs rename to mzLib/Development/DeconvolutionDevelopment/TestCases/SinglePeakDeconvolutionTestCase.cs index 2bcf25db8..4e63f2392 100644 --- a/mzLib/Development/Deconvolution/TestCases/SinglePeakDeconvolutionTestCase.cs +++ b/mzLib/Development/DeconvolutionDevelopment/TestCases/SinglePeakDeconvolutionTestCase.cs @@ -23,7 +23,7 @@ public class SinglePeakDeconvolutionTestCase public SinglePeakDeconvolutionTestCase(DeconvolutionParameters deconParameters, string sampleInformation, string spectrumPath, int scanNumber, double expectedMostAbundantObservedIsotopicMass, int expectedIonChargeState, double 
selectedIonMz, double precursorPpmMassTolerance) { - + DeconvolutionParameters = deconParameters; SampleInformation = sampleInformation; ExpectedMostAbundantObservedIsotopicMass = expectedMostAbundantObservedIsotopicMass; ExpectedIonChargeState = expectedIonChargeState; diff --git a/mzLib/Development/Deconvolution/TestCases/TestDevelopmentTestCases.cs b/mzLib/Development/DeconvolutionDevelopment/TestCases/TestDevelopmentTestCases.cs similarity index 100% rename from mzLib/Development/Deconvolution/TestCases/TestDevelopmentTestCases.cs rename to mzLib/Development/DeconvolutionDevelopment/TestCases/TestDevelopmentTestCases.cs diff --git a/mzLib/Development/Deconvolution/TestCases/WholeSpectrumDeconvolutionTestCase.cs b/mzLib/Development/DeconvolutionDevelopment/TestCases/WholeSpectrumDeconvolutionTestCase.cs similarity index 100% rename from mzLib/Development/Deconvolution/TestCases/WholeSpectrumDeconvolutionTestCase.cs rename to mzLib/Development/DeconvolutionDevelopment/TestCases/WholeSpectrumDeconvolutionTestCase.cs diff --git a/mzLib/Development/Deconvolution/TestData/Averaged_221110_CytoOnly.mzML b/mzLib/Development/DeconvolutionDevelopment/TestData/Averaged_221110_CytoOnly.mzML similarity index 100% rename from mzLib/Development/Deconvolution/TestData/Averaged_221110_CytoOnly.mzML rename to mzLib/Development/DeconvolutionDevelopment/TestData/Averaged_221110_CytoOnly.mzML diff --git a/mzLib/Development/Deconvolution/TestData/Averaged_221110_HGHOnly.mzML b/mzLib/Development/DeconvolutionDevelopment/TestData/Averaged_221110_HGHOnly.mzML similarity index 100% rename from mzLib/Development/Deconvolution/TestData/Averaged_221110_HGHOnly.mzML rename to mzLib/Development/DeconvolutionDevelopment/TestData/Averaged_221110_HGHOnly.mzML diff --git a/mzLib/Development/Deconvolution/TestData/Averaged_221110_UbiqOnly.mzML b/mzLib/Development/DeconvolutionDevelopment/TestData/Averaged_221110_UbiqOnly.mzML similarity index 100% rename from 
mzLib/Development/Deconvolution/TestData/Averaged_221110_UbiqOnly.mzML rename to mzLib/Development/DeconvolutionDevelopment/TestData/Averaged_221110_UbiqOnly.mzML diff --git a/mzLib/Development/Development.csproj b/mzLib/Development/Development.csproj index 0f0eaf199..cd65a3163 100644 --- a/mzLib/Development/Development.csproj +++ b/mzLib/Development/Development.csproj @@ -23,13 +23,13 @@ - + PreserveNewest - + PreserveNewest - + PreserveNewest diff --git a/mzLib/FlashLFQ/ChromatographicPeak.cs b/mzLib/FlashLFQ/ChromatographicPeak.cs index 9041eab66..d7bac2195 100644 --- a/mzLib/FlashLFQ/ChromatographicPeak.cs +++ b/mzLib/FlashLFQ/ChromatographicPeak.cs @@ -1,9 +1,9 @@ -using MzLibUtil; -using System; +using System; using System.Collections.Generic; using System.Linq; using System.Text; using ClassExtensions = Chemistry.ClassExtensions; +using FlashLFQ.PEP; namespace FlashLFQ { @@ -16,20 +16,23 @@ public class ChromatographicPeak public int ScanCount => IsotopicEnvelopes.Count; public double SplitRT; public readonly bool IsMbrPeak; - public double PredictedRetentionTime { get; init; } - + public double MbrScore; + public double PpmScore { get; set; } + public double IntensityScore { get; set; } + public double RtScore { get; set; } + public double ScanCountScore { get; set; } + public double IsotopicDistributionScore { get; set; } /// - /// A score bounded by 100 and 0, with more confident MBR-detections receiving higher scores + /// Stores the pearson correlation between the apex isotopic envelope and the theoretical isotopic distribution /// - public double MbrScore { get; private set; } - - /// The four scores below are bounded by 0 and 1, with higher scores being better - public double PpmScore { get; private set; } - public double IntensityScore { get; private set; } - public double RtScore { get; private set; } - public double ScanCountScore { get; private set; } - - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo) + 
public double IsotopicPearsonCorrelation => Apex?.PearsonCorrelation ?? -1; + public double RtPredictionError { get; set; } + public List ChargeList { get; set; } + internal double MbrQValue { get; set; } + public ChromatographicPeakData PepPeakData { get; set; } + public double? MbrPep { get; set; } + + public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, bool randomRt = false) { SplitRT = 0; NumChargeStatesObserved = 0; @@ -40,12 +43,14 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi IsotopicEnvelopes = new List(); IsMbrPeak = isMbrPeak; SpectraFileInfo = fileInfo; + RandomRt = randomRt; } - public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fileInfo, double predictedRetentionTime) : - this(id, isMbrPeak, fileInfo) + public bool Equals(ChromatographicPeak peak) { - PredictedRetentionTime = predictedRetentionTime; + return SpectraFileInfo.Equals(peak.SpectraFileInfo) + && Identifications.First().ModifiedSequence.Equals(peak.Identifications.First().ModifiedSequence) + && ApexRetentionTime == peak.ApexRetentionTime; } public IsotopicEnvelope Apex { get; private set; } @@ -54,13 +59,18 @@ public ChromatographicPeak(Identification id, bool isMbrPeak, SpectraFileInfo fi public int NumIdentificationsByBaseSeq { get; private set; } public int NumIdentificationsByFullSeq { get; private set; } public double MassError { get; private set; } + /// + /// Bool that describes whether the retention time of this peak was randomized + /// If true, implies that this peak is a decoy peak identified by the MBR algorithm + /// + public bool RandomRt { get; } + public bool DecoyPeptide => Identifications.First().IsDecoy; public void CalculateIntensityForThisFeature(bool integrate) { if (IsotopicEnvelopes.Any()) { - double maxIntensity = IsotopicEnvelopes.Max(p => p.Intensity); - Apex = IsotopicEnvelopes.First(p => p.Intensity == maxIntensity); + Apex = IsotopicEnvelopes.MaxBy(p => 
p.Intensity); if (integrate) { @@ -123,25 +133,6 @@ public void ResolveIdentifications() this.NumIdentificationsByBaseSeq = Identifications.Select(v => v.BaseSequence).Distinct().Count(); this.NumIdentificationsByFullSeq = Identifications.Select(v => v.ModifiedSequence).Distinct().Count(); } - - /// - /// Calculates four component scores and one overarching Mbr score for an MBR peak. - /// MBR Score is equal to 100 * the geometric mean of the four component scores. - /// - /// An MbrScorer specific to the file where this peak was found - /// The donor peak used as the basis for the MBR identification. - internal void CalculateMbrScore(MbrScorer scorer, ChromatographicPeak donorPeak) - { - if (SpectraFileInfo != scorer.AcceptorFile) throw new MzLibException("Error when performing match-between-runs: Mismatch between scorer and peak."); - - IntensityScore = scorer.CalculateIntensityScore(this, donorPeak); - RtScore = scorer.CalculateRetentionTimeScore(this, donorPeak); - PpmScore = scorer.CalculatePpmErrorScore(this); - ScanCountScore = scorer.CalculateScanCountScore(this); - - MbrScore = 100 * Math.Pow(IntensityScore * RtScore * PpmScore * ScanCountScore, 0.25); - } - public static string TabSeparatedHeader { get @@ -164,20 +155,18 @@ public static string TabSeparatedHeader sb.Append("Peak Charge" + "\t"); sb.Append("Num Charge States Observed" + "\t"); sb.Append("Peak Detection Type" + "\t"); - sb.Append("MBR Score" + "\t"); - sb.Append("Ppm Score" + "\t"); - sb.Append("Intensity Score" + "\t"); - sb.Append("Rt Score" + "\t"); - sb.Append("Scan Count Score" + "\t"); + sb.Append("PIP Q-Value" + "\t"); + sb.Append("PIP PEP" + "\t"); sb.Append("PSMs Mapped" + "\t"); sb.Append("Base Sequences Mapped" + "\t"); sb.Append("Full Sequences Mapped" + "\t"); sb.Append("Peak Split Valley RT" + "\t"); - sb.Append("Peak Apex Mass Error (ppm)"); + sb.Append("Peak Apex Mass Error (ppm)" + "\t"); + sb.Append("Decoy Peptide" + "\t"); + sb.Append("Random RT"); return sb.ToString(); } 
} - public override string ToString() { StringBuilder sb = new StringBuilder(); @@ -260,17 +249,16 @@ public override string ToString() sb.Append("" + "MSMS" + "\t"); } - sb.Append("" + (IsMbrPeak ? MbrScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? PpmScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? IntensityScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? RtScore.ToString() : "") + "\t"); - sb.Append("" + (IsMbrPeak ? ScanCountScore.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? MbrQValue.ToString() : "") + "\t"); + sb.Append("" + (IsMbrPeak ? MbrPep.ToString() : "") + "\t"); sb.Append("" + Identifications.Count + "\t"); sb.Append("" + NumIdentificationsByBaseSeq + "\t"); sb.Append("" + NumIdentificationsByFullSeq + "\t"); sb.Append("" + SplitRT + "\t"); sb.Append("" + MassError); + sb.Append("\t" + DecoyPeptide); + sb.Append("\t" + RandomRt); return sb.ToString(); } diff --git a/mzLib/FlashLFQ/FlashLFQ.csproj b/mzLib/FlashLFQ/FlashLFQ.csproj index 097bf6131..d5c466967 100644 --- a/mzLib/FlashLFQ/FlashLFQ.csproj +++ b/mzLib/FlashLFQ/FlashLFQ.csproj @@ -13,6 +13,8 @@ + + diff --git a/mzLib/FlashLFQ/FlashLFQResults.cs b/mzLib/FlashLFQ/FlashLFQResults.cs index 9e5f8e0e4..7e7f7bfb5 100644 --- a/mzLib/FlashLFQ/FlashLFQResults.cs +++ b/mzLib/FlashLFQ/FlashLFQResults.cs @@ -1,5 +1,7 @@ using Easy.Common.Extensions; using MathNet.Numerics.Statistics; +using MzLibUtil; +using Proteomics; using System; using System.Collections.Generic; using System.IO; @@ -14,28 +16,28 @@ public class FlashLfqResults public readonly Dictionary PeptideModifiedSequences; public readonly Dictionary ProteinGroups; public readonly Dictionary> Peaks; + public Dictionary ModInfo { get; private set; } private readonly HashSet _peptideModifiedSequencesToQuantify; + public string PepResultString { get; set; } + public double MbrQValueThreshold { get; set; } - public FlashLfqResults(List spectraFiles, List identifications, HashSet peptides = null) + 
public FlashLfqResults(List spectraFiles, List identifications, double mbrQValueThreshold = 0.05, + HashSet peptideModifiedSequencesToQuantify = null) { SpectraFiles = spectraFiles; PeptideModifiedSequences = new Dictionary(); ProteinGroups = new Dictionary(); Peaks = new Dictionary>(); - if(peptides == null || !peptides.Any()) - { - peptides = identifications.Select(id => id.ModifiedSequence).ToHashSet(); - } - _peptideModifiedSequencesToQuantify = peptides; + MbrQValueThreshold = mbrQValueThreshold; + _peptideModifiedSequencesToQuantify = peptideModifiedSequencesToQuantify ?? identifications.Where(id => !id.IsDecoy).Select(id => id.ModifiedSequence).ToHashSet(); foreach (SpectraFileInfo file in spectraFiles) { Peaks.Add(file, new List()); } - // Only quantify peptides within the set of valid peptide modified (full) sequences. This is done to enable pepitde-level FDR control of reported results - foreach (Identification id in identifications.Where(id => peptides.Contains(id.ModifiedSequence))) + foreach (Identification id in identifications.Where(id => !id.IsDecoy & _peptideModifiedSequencesToQuantify.Contains(id.ModifiedSequence))) { if (!PeptideModifiedSequences.TryGetValue(id.ModifiedSequence, out Peptide peptide)) { @@ -59,6 +61,17 @@ public FlashLfqResults(List spectraFiles, List } } + public void ReNormalizeResults(bool integrate = false, int maxThreads = 10, bool useSharedPeptides = false) + { + foreach(var peak in Peaks.SelectMany(p => p.Value)) + { + peak.CalculateIntensityForThisFeature(integrate); + } + new IntensityNormalizationEngine(this, integrate, silent: true, maxThreads).NormalizeResults(); + CalculatePeptideResults(quantifyAmbiguousPeptides: false); + CalculateProteinResultsMedianPolish(useSharedPeptides: useSharedPeptides); + } + public void MergeResultsWith(FlashLfqResults mergeFrom) { this.SpectraFiles.AddRange(mergeFrom.SpectraFiles); @@ -128,6 +141,8 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides) { var groupedPeaks = 
filePeaks.Value .Where(p => p.NumIdentificationsByFullSeq == 1) + .Where(p => !p.Identifications.First().IsDecoy) + .Where(p => !p.IsMbrPeak || (p.MbrQValue < MbrQValueThreshold && !p.RandomRt)) .GroupBy(p => p.Identifications.First().ModifiedSequence) .Where(group => _peptideModifiedSequencesToQuantify.Contains(group.Key)) .ToList(); @@ -163,11 +178,15 @@ public void CalculatePeptideResults(bool quantifyAmbiguousPeptides) // report ambiguous quantification var ambiguousPeaks = filePeaks.Value .Where(p => p.NumIdentificationsByFullSeq > 1) + .Where(p => !p.Identifications.First().IsDecoy) + .Where(p => !p.IsMbrPeak || (p.MbrQValue < MbrQValueThreshold && !p.RandomRt)) .ToList(); foreach (ChromatographicPeak ambiguousPeak in ambiguousPeaks) { - foreach (Identification id in ambiguousPeak.Identifications) + foreach (Identification id in ambiguousPeak.Identifications.Where(id => !id.IsDecoy)) { + if (!_peptideModifiedSequencesToQuantify.Contains(id.ModifiedSequence)) continue; // Ignore the ids/sequences we don't want to quantify + string sequence = id.ModifiedSequence; double alreadyRecordedIntensity = PeptideModifiedSequences[sequence].GetIntensity(filePeaks.Key); @@ -224,7 +243,7 @@ private void HandleAmbiguityInFractions() foreach (SpectraFileInfo file in sample) { - foreach (ChromatographicPeak peak in Peaks[file]) + foreach (ChromatographicPeak peak in Peaks[file].Where(p => !p.IsMbrPeak || p.MbrQValue < MbrQValueThreshold)) { foreach (Identification id in peak.Identifications) { @@ -331,6 +350,29 @@ public void CalculateProteinResultsTop3(bool useSharedPeptides) } } } + /// + /// Calculate peptide level ptm occupancy with either all peptides to be quantified (by intensity) or a subset of FlashLFQ-identified peptides with an arbitrary peptide-level quantifier. + /// + /// Dictionary where keys are string-typed peptide full sequences in PeptideModifiedSequences and the value is a double-typed quantifier of that peptide. 
+ /// If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing). + /// If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid. + /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod + public void CalculatePTMOccupancy(Dictionary quantifiedPeptides=null, bool modOnNTerminus=true, bool modOnCTerminus=true) + { + quantifiedPeptides = quantifiedPeptides ?? new Dictionary { }; + + var peptides = _peptideModifiedSequencesToQuantify + .Where(pep => PeptideModifiedSequences.ContainsKey(pep)) + .Select(pep => Tuple.Create( + PeptideModifiedSequences[pep].Sequence, + PeptideModifiedSequences[pep].BaseSequence, + PeptideModifiedSequences[pep].ProteinGroups.Select(pg => pg.ProteinGroupName).ToList(), + quantifiedPeptides.GetValueOrDefault(pep, PeptideModifiedSequences[pep].GetTotalIntensity()))).ToList(); + + PositionFrequencyAnalysis pfa = new PositionFrequencyAnalysis(); + pfa.PeptidePTMOccupancy(peptides, modOnNTerminus, modOnCTerminus); + ModInfo = pfa.Occupancy; + } /// /// This method uses the median polish algorithm to calculate protein quantities in each biological replicate. 
@@ -549,7 +591,7 @@ public void CalculateProteinResultsMedianPolish(bool useSharedPeptides) } } - public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent = true) + public void WriteResults(string peaksOutputPath, string modPeptideOutputPath, string proteinOutputPath, string bayesianProteinQuantOutput, bool silent) { if (!silent) { diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 712eae6a1..b052ca9fd 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -12,11 +12,21 @@ using UsefulProteomicsDatabases; using System.Runtime.CompilerServices; using Easy.Common.Extensions; +using FlashLFQ.PEP; +using System.IO; +using System.Threading; [assembly: InternalsVisibleTo("TestFlashLFQ")] namespace FlashLFQ { + public enum DonorCriterion + { + Score, + Intensity, + Neighbors + } + public class FlashLfqEngine { // settings @@ -38,9 +48,24 @@ public class FlashLfqEngine public readonly bool MatchBetweenRuns; public readonly double MbrRtWindow; public readonly double MbrPpmTolerance; - public readonly bool RequireMsmsIdInCondition; + public readonly double MbrDetectionQValueThreshold; private int _numberOfAnchorPeptidesForMbr = 3; // the number of anchor peptides used for local alignment when predicting retention times of MBR acceptor peptides + // New MBR Settings + public readonly double RtWindowIncrease = 0; + public readonly double MbrAlignmentWindow = 2.5; + public readonly double PepTrainingFraction = 0.25; + /// + /// Specifies how the donor peak for MBR is selected. 
+ /// 'Score' selects the donor peak associated with the highest scoring PSM + /// 'Intensity' selects the donor peak with the max intensity + /// 'Neighbors' selects the donor peak with the most neighboring peaks + /// + public DonorCriterion DonorCriterion { get; init; } + public readonly double DonorQValueThreshold; + public readonly bool RequireMsmsIdInCondition; + private int _randomSeed = 42; + // settings for the Bayesian protein quantification engine public readonly bool BayesianProteinQuant; @@ -61,7 +86,7 @@ public class FlashLfqEngine /// Other peptides may appear in the QuantifiedPeaks output, but this list is used to enable /// peptide-level FDR filtering /// - public HashSet PeptidesModifiedSequencesToQuantify { get; init; } + public HashSet PeptideModifiedSequencesToQuantify { get; init; } /// /// Dictionary linking a modified sequence to a List of tuples containing /// the mass shifts (isotope mass - monoisotopic mass) and normalized abundances for the @@ -72,6 +97,7 @@ public class FlashLfqEngine private FlashLfqResults _results; internal Dictionary _ms1Scans; internal PeakIndexingEngine _peakIndexingEngine; + internal Dictionary> DonorFileToPeakDict { get; private set; } /// /// Create an instance of FlashLFQ that will quantify peptides based on their precursor intensity in MS1 spectra @@ -95,8 +121,9 @@ public FlashLfqEngine( // MBR settings bool matchBetweenRuns = false, double matchBetweenRunsPpmTolerance = 10.0, - double maxMbrWindow = 2.5, + double maxMbrWindow = 1.0, bool requireMsmsIdInCondition = false, + double matchBetweenRunsFdrThreshold = 0.05, // settings for the Bayesian protein quantification engine bool bayesianProteinQuant = false, @@ -106,8 +133,10 @@ public FlashLfqEngine( int mcmcBurninSteps = 1000, bool useSharedPeptidesForProteinQuant = false, bool pairedSamples = false, - List peptideSequencesToUse = null, - int? randomSeed = null) + int? 
randomSeed = null, + DonorCriterion donorCriterion = DonorCriterion.Score, + double donorQValueThreshold = 0.01, + List peptideSequencesToQuantify = null) { Loaders.LoadElements(); @@ -122,17 +151,17 @@ public FlashLfqEngine( .ThenBy(p => p.TechnicalReplicate).ToList(); _allIdentifications = allIdentifications; + PeptideModifiedSequencesToQuantify = peptideSequencesToQuantify.IsNotNullOrEmpty() + ? new HashSet(peptideSequencesToQuantify) + : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet(); PpmTolerance = ppmTolerance; IsotopePpmTolerance = isotopeTolerancePpm; - MatchBetweenRuns = matchBetweenRuns; - MbrPpmTolerance = matchBetweenRunsPpmTolerance; + Integrate = integrate; NumIsotopesRequired = numIsotopesRequired; QuantifyAmbiguousPeptides = quantifyAmbiguousPeptides; Silent = silent; IdSpecificChargeState = idSpecificChargeState; - MbrRtWindow = maxMbrWindow; - RequireMsmsIdInCondition = requireMsmsIdInCondition; Normalize = normalize; MaxThreads = maxThreads; @@ -143,8 +172,14 @@ public FlashLfqEngine( McmcSteps = mcmcSteps; McmcBurninSteps = mcmcBurninSteps; UseSharedPeptidesForProteinQuant = useSharedPeptidesForProteinQuant; - PeptidesModifiedSequencesToQuantify = peptideSequencesToUse.IsNotNullOrEmpty() ? 
new HashSet(peptideSequencesToUse) - : allIdentifications.Select(id => id.ModifiedSequence).ToHashSet(); + + // MBR settings + MatchBetweenRuns = matchBetweenRuns; + MbrPpmTolerance = matchBetweenRunsPpmTolerance; + MbrRtWindow = maxMbrWindow; + DonorCriterion = donorCriterion; + DonorQValueThreshold = donorQValueThreshold; + MbrDetectionQValueThreshold = matchBetweenRunsFdrThreshold; RandomSeed = randomSeed; if (MaxThreads == -1 || MaxThreads >= Environment.ProcessorCount) @@ -166,7 +201,7 @@ public FlashLfqResults Run() { _globalStopwatch.Start(); _ms1Scans = new Dictionary(); - _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, PeptidesModifiedSequencesToQuantify); + _results = new FlashLfqResults(_spectraFileInfo, _allIdentifications, MbrDetectionQValueThreshold, PeptideModifiedSequencesToQuantify); // build m/z index keys CalculateTheoreticalIsotopeDistributions(); @@ -206,6 +241,8 @@ public FlashLfqResults Run() // do MBR if (MatchBetweenRuns) { + Console.WriteLine("Find the best donors for match-between-runs"); + FindPeptideDonorFiles(); foreach (var spectraFile in _spectraFileInfo) { if (!Silent) @@ -214,7 +251,6 @@ public FlashLfqResults Run() } QuantifyMatchBetweenRunsPeaks(spectraFile); - _peakIndexingEngine.ClearIndex(); if (!Silent) @@ -222,6 +258,14 @@ public FlashLfqResults Run() Console.WriteLine("Finished MBR for " + spectraFile.FilenameWithoutExtension); } } + + Console.WriteLine("Computing PEP for MBR Transfers"); + bool pepSuccesful = RunPEPAnalysis(); + + foreach (var spectraFile in _spectraFileInfo) + { + CalculateFdrForMbrPeaks(spectraFile, pepSuccesful); + } } // normalize @@ -236,6 +280,9 @@ public FlashLfqResults Run() // do top3 protein quantification _results.CalculateProteinResultsMedianPolish(UseSharedPeptidesForProteinQuant); + // calculate ptm occupancy at the peptide level + _results.CalculatePTMOccupancy(); + // do Bayesian protein fold-change analysis if (BayesianProteinQuant) { @@ -489,52 +536,59 @@ private 
void QuantifyMs2IdentifiedPeptides(SpectraFileInfo fileInfo) _results.Peaks[fileInfo].AddRange(chromatographicPeaks.ToList()); } + #region MatchBetweenRuns /// /// Used by the match-between-runs algorithm to determine systematic retention time drifts between /// chromatographic runs. /// - private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, MbrScorer scorer) - { + private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, SpectraFileInfo acceptor, MbrScorer scorer, + out List donorFileBestMsmsPeaksOrderedByMass) + { Dictionary donorFileBestMsmsPeaks = new(); Dictionary acceptorFileBestMsmsPeaks = new(); List rtCalibrationCurve = new(); List anchorPeptideRtDiffs = new(); // anchor peptides are peptides that were MS2 detected in both the donor and acceptor runs + Dictionary> donorFileAllMsmsPeaks = _results.Peaks[donor] + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && !peak.IsMbrPeak + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); - // get all peaks, not counting ambiguous peaks - IEnumerable donorPeaks = _results.Peaks[donor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - IEnumerable acceptorPeaks = _results.Peaks[acceptor].Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1); - - // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in acceptorPeaks) + // iterate through each unique donor sequence + foreach (var sequencePeakListKvp in donorFileAllMsmsPeaks) { - if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > acceptorPeak.Intensity) - { - 
acceptorFileBestMsmsPeaks[acceptorPeak.Identifications.First().ModifiedSequence] = acceptorPeak; - } - } - else - { - acceptorFileBestMsmsPeaks.Add(acceptorPeak.Identifications.First().ModifiedSequence, acceptorPeak); - } + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + donorFileBestMsmsPeaks.Add(sequencePeakListKvp.Key, bestPeak); } - // get the best (most intense) peak for each peptide in the donor file - foreach (ChromatographicPeak donorPeak in donorPeaks) + Dictionary> acceptorFileAllMsmsPeaks = _results.Peaks[acceptor] + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && !peak.IsMbrPeak + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each acceptor sequence + foreach (var sequencePeakListKvp in acceptorFileAllMsmsPeaks) { - if (donorFileBestMsmsPeaks.TryGetValue(donorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) - { - if (currentBestPeak.Intensity > donorPeak.Intensity) - { - donorFileBestMsmsPeaks[donorPeak.Identifications.First().ModifiedSequence] = donorPeak; - } - } - else - { - donorFileBestMsmsPeaks.Add(donorPeak.Identifications.First().ModifiedSequence, donorPeak); - } + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + acceptorFileBestMsmsPeaks.Add(sequencePeakListKvp.Key, bestPeak); } // create RT calibration curve @@ -545,7 +599,7 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec if (donorFileBestMsmsPeaks.TryGetValue(peak.Key, out ChromatographicPeak donorFilePeak)) { 
rtCalibrationCurve.Add(new RetentionTimeCalibDataPoint(donorFilePeak, acceptorFilePeak)); - if(donorFilePeak.ApexRetentionTime > 0 && acceptorFilePeak.ApexRetentionTime > 0) + if (donorFilePeak.ApexRetentionTime > 0 && acceptorFilePeak.ApexRetentionTime > 0) { anchorPeptideRtDiffs.Add(donorFilePeak.ApexRetentionTime - acceptorFilePeak.ApexRetentionTime); } @@ -553,10 +607,89 @@ private RetentionTimeCalibDataPoint[] GetRtCalSpline(SpectraFileInfo donor, Spec } scorer.AddRtPredErrorDistribution(donor, anchorPeptideRtDiffs, _numberOfAnchorPeptidesForMbr); + donorFileBestMsmsPeaksOrderedByMass = donorFileBestMsmsPeaks.Select(kvp => kvp.Value).OrderBy(p => p.Identifications.First().PeakfindingMass).ToList(); return rtCalibrationCurve.OrderBy(p => p.DonorFilePeak.Apex.IndexedPeak.RetentionTime).ToArray(); } + /// + /// For every MSMS identified peptide, selects one file that will be used as the donor + /// by finding files that contain the most peaks in the local neighborhood, + /// then writes the results to the DonorFileToPeakDict. + /// WARNING! 
Strong assumption that this is called BEFORE MBR peaks are identified/assigned to the results + /// + private void FindPeptideDonorFiles() + { + DonorFileToPeakDict = new Dictionary>(); + + Dictionary> seqPeakDict = _results.Peaks + .SelectMany(kvp => kvp.Value) + .Where(peak => peak.NumIdentificationsByFullSeq == 1 + && peak.IsotopicEnvelopes.Any() + && peak.Identifications.Min(id => id.QValue) < DonorQValueThreshold) + .GroupBy(peak => peak.Identifications.First().ModifiedSequence) + .Where(group => PeptideModifiedSequencesToQuantify.Contains(group.Key)) + .ToDictionary(group => group.Key, group => group.ToList()); + + // iterate through each unique sequence + foreach (var sequencePeakListKvp in seqPeakDict) + { + List peaksForPeptide = sequencePeakListKvp.Value; + if (!peaksForPeptide.Any()) + continue; + + ChromatographicPeak bestPeak = ChooseBestPeak(peaksForPeptide); + + if (bestPeak == null) continue; + if (DonorFileToPeakDict.ContainsKey(bestPeak.SpectraFileInfo)) + { + DonorFileToPeakDict[bestPeak.SpectraFileInfo].Add(bestPeak); + } + else + { + DonorFileToPeakDict.Add(bestPeak.SpectraFileInfo, new List { bestPeak }); + } + } + } + + internal ChromatographicPeak ChooseBestPeak(List peaks) + { + ChromatographicPeak bestPeak = null; + switch (DonorCriterion) + { + case DonorCriterion.Score: // Select best peak by the PSM score + bestPeak = peaks.MaxBy(peak => peak.Identifications.Max(id => id.PsmScore)); + if (bestPeak.Identifications.First().PsmScore > 0) + break; + else // if every ID has a score of zero, let it fall through to the default case + goto default; + case DonorCriterion.Neighbors: // Select peak with the most neighboring peaks + int maxPeaks = 0; + foreach (var donorPeak in peaks) + { + // Count the number of neighboring peaks with unique peptides + int neighboringPeaksCount = _results.Peaks[donorPeak.SpectraFileInfo] + .Where(peak => Math.Abs(peak.ApexRetentionTime - donorPeak.ApexRetentionTime) < MbrAlignmentWindow) + .Select(peak => 
peak.Identifications.First().ModifiedSequence) + .Distinct() + .Count(); + + if (neighboringPeaksCount > maxPeaks) + { + maxPeaks = neighboringPeaksCount; + bestPeak = donorPeak; + } + } + break; + case DonorCriterion.Intensity: // Select the peak with the highest intensity + default: + bestPeak = peaks.MaxBy(peak => peak.Intensity); + break; + } + + return bestPeak; + } + /// /// Used by MBR. Predicts the retention time of a peak in an acceptor file based on the /// retention time of the peak in the donor file. This is done with a local alignment @@ -601,21 +734,21 @@ internal RtInfo PredictRetentionTime( int numberOfForwardAnchors = 0; // gather nearby data points - for (int r = index+1; r < rtCalibrationCurve.Length; r++) + for (int r = index + 1; r < rtCalibrationCurve.Length; r++) { double rtDiff = rtCalibrationCurve[r].DonorFilePeak.Apex.IndexedPeak.RetentionTime - donorPeak.Apex.IndexedPeak.RetentionTime; - if (rtCalibrationCurve[r].AcceptorFilePeak != null + if (rtCalibrationCurve[r].AcceptorFilePeak != null && rtCalibrationCurve[r].AcceptorFilePeak.ApexRetentionTime > 0) { - if(Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment + if (Math.Abs(rtDiff) > 0.5) // If the rtDiff is too large, it's no longer local alignment { break; } nearbyCalibrationPoints.Add(rtCalibrationCurve[r]); numberOfForwardAnchors++; - if(numberOfForwardAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points + if (numberOfForwardAnchors >= _numberOfAnchorPeptidesForMbr) // We only want a handful of anchor points { - break; + break; } } } @@ -642,20 +775,25 @@ internal RtInfo PredictRetentionTime( if (!nearbyCalibrationPoints.Any()) { - return null; + // If there are no nearby calibration points, return the donor peak's RT and a width of 15 seconds + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime, width: 0.25); } // calculate difference between acceptor and donor RTs for these RT region List 
rtDiffs = nearbyCalibrationPoints - .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) + .Select(p => p.DonorFilePeak.ApexRetentionTime - p.AcceptorFilePeak.ApexRetentionTime) .ToList(); - + double medianRtDiff = rtDiffs.Median(); - double rtRange = rtDiffs.InterquartileRange() * 4.5; - // IQR * 4.5 is roughly equivalent to 6 StdDevs, so search window extends ~3 std.devs from either side of predicted RT - // IQR is less affected by outliers than StdDev + if(rtDiffs.Count == 1) + { + // If there is only one nearby calibration point, shift the donor peak's RT by that single RT difference and use a width of 15 seconds + return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: 0.25); + } + + double rtRange = rtDiffs.StandardDeviation() * 6; - rtRange = Math.Min(rtRange, MbrRtWindow); + rtRange = Math.Min(rtRange, MbrRtWindow); return new RtInfo(predictedRt: donorPeak.Apex.IndexedPeak.RetentionTime - medianRtDiff, width: rtRange); } @@ -672,7 +810,8 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie var apexToAcceptorFilePeakDict = new Dictionary(); List ppmErrors = new List(); foreach (var peak in acceptorFileIdentifiedPeaks.Where(p => p.Apex != null - && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence))) + && PeptideModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) + && p.Identifications.First().QValue < DonorQValueThreshold)) { if (!apexToAcceptorFilePeakDict.ContainsKey(peak.Apex.IndexedPeak)) { @@ -702,6 +841,56 @@ private MbrScorer BuildMbrScorer(List acceptorFileIdentifie return new MbrScorer(apexToAcceptorFilePeakDict, acceptorFileIdentifiedPeaks, ppmDistribution, logIntensityDistribution); } + /// + /// Returns a pseudo-randomly selected peak that does not have the same mass as the donor + /// + /// + /// Will search for a peak at least 5 Da away from the peakfinding mass + /// + internal ChromatographicPeak GetRandomPeak( + List 
peaksOrderedByMass, + double donorPeakRetentionTime, + double retentionTimeMinDiff, + Identification donorIdentification) + { + double minDiff = 5 * PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass; + double maxDiff = 11 * PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass; + double donorPeakPeakfindingMass = donorIdentification.PeakfindingMass; + + // Theoretically we could do a binary search but we're just going to iterate through the whole list of donor peaks + List randomPeakCandidates = peaksOrderedByMass + .Where(p => + p.ApexRetentionTime > 0 + && Math.Abs(p.ApexRetentionTime - donorPeakRetentionTime) > retentionTimeMinDiff + && p.Identifications.First().BaseSequence != donorIdentification.BaseSequence + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) > minDiff + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) < maxDiff) + .ToList(); + + while (!randomPeakCandidates.Any() & maxDiff < 1e5) + { + // Increase the search space by a factor of 10 and try again + maxDiff *= 10; + randomPeakCandidates = peaksOrderedByMass + .Where(p => + p.ApexRetentionTime > 0 + && Math.Abs(p.ApexRetentionTime - donorPeakRetentionTime) > retentionTimeMinDiff + && p.Identifications.First().BaseSequence != donorIdentification.BaseSequence + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) > minDiff + && Math.Abs(p.Identifications.First().PeakfindingMass - donorPeakPeakfindingMass) < maxDiff) + .ToList(); + } + + if (!randomPeakCandidates.Any()) + { + return null; + } + + // Generates a pseudo-random number based on the donor peak finding mass + retention time + int pseudoRandomNumber = (int)(1e5 * (donorIdentification.PeakfindingMass % 1.0) * (donorIdentification.Ms2RetentionTimeInMinutes % 1.0)) % randomPeakCandidates.Count; + return randomPeakCandidates[pseudoRandomNumber]; + } + /// /// This method maps identified peaks from other chromatographic runs ("donors") 
onto /// the defined chromatographic run ("acceptor"). The goal is to reduce the number of missing @@ -722,7 +911,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // these are the analytes already identified in this run. we don't need to try to match them from other runs var acceptorFileIdentifiedSequences = new HashSet(acceptorFileIdentifiedPeaks - .Where(peak => peak.IsotopicEnvelopes.Any()) + .Where(peak => peak.IsotopicEnvelopes.Any() && peak.Identifications.Min(id => id.QValue) < 0.01) .SelectMany(p => p.Identifications.Select(d => d.ModifiedSequence))); MbrScorer scorer = BuildMbrScorer(acceptorFileIdentifiedPeaks, out var mbrTol); @@ -748,24 +937,24 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } // this stores the results of MBR - var matchBetweenRunsIdentifiedPeaks = new Dictionary>>(); + ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks = new(); // map each donor file onto this file - foreach (SpectraFileInfo idDonorFile in _spectraFileInfo) + foreach (var donorFilePeakListKvp in DonorFileToPeakDict) { - if (idAcceptorFile.Equals(idDonorFile)) + if (idAcceptorFile.Equals(donorFilePeakListKvp.Key)) { continue; } // this is the list of peaks identified in the other file but not in this one ("ID donor peaks") - List idDonorPeaks = _results.Peaks[idDonorFile].Where(p => - !p.IsMbrPeak - && p.NumIdentificationsByFullSeq == 1 - && p.IsotopicEnvelopes.Any() - && PeptidesModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence) // Only do MBR for peptides that we want to quantify - && !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) - && (!RequireMsmsIdInCondition || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g))))).ToList(); + List idDonorPeaks = donorFilePeakListKvp.Value + .Where(p => + !acceptorFileIdentifiedSequences.Contains(p.Identifications.First().ModifiedSequence) + && 
(!RequireMsmsIdInCondition + || p.Identifications.Any(v => v.ProteinGroups.Any(g => thisFilesMsmsIdentifiedProteins.Contains(g)))) + && this.PeptideModifiedSequencesToQuantify.Contains(p.Identifications.First().ModifiedSequence)) + .ToList(); if (!idDonorPeaks.Any()) { @@ -773,7 +962,7 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) } bool donorSampleIsFractionated = _results.SpectraFiles - .Where(p => p.Condition == idDonorFile.Condition && p.BiologicalReplicate == idDonorFile.BiologicalReplicate) + .Where(p => p.Condition == donorFilePeakListKvp.Key.Condition && p.BiologicalReplicate == donorFilePeakListKvp.Key.BiologicalReplicate) .Select(p => p.Fraction) .Distinct() .Count() > 1; @@ -781,21 +970,22 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) // We're only interested in the fold change if the conditions are different. Otherwise, we score based off of the intensities // of the acceptor file if (_spectraFileInfo.Select(p => p.Condition).Distinct().Count() > 1 - && idDonorFile.Condition != idAcceptorFile.Condition) + && donorFilePeakListKvp.Key.Condition != idAcceptorFile.Condition) { scorer.CalculateFoldChangeBetweenFiles(idDonorPeaks); } // generate RT calibration curve - RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(idDonorFile, idAcceptorFile, scorer); + RetentionTimeCalibDataPoint[] rtCalibrationCurve = GetRtCalSpline(donorFilePeakListKvp.Key, idAcceptorFile, scorer, out var donorPeaksMassOrdered); + + // break if MBR transfers can't be scored + if (!scorer.IsValid(donorFilePeakListKvp.Key)) continue; // Loop through every MSMS id in the donor file Parallel.ForEach(Partitioner.Create(0, idDonorPeaks.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, (range, loopState) => { - var matchBetweenRunsIdentifiedPeaksThreadSpecific = new Dictionary>>(); - for (int i = range.Item1; i < range.Item2; i++) { ChromatographicPeak donorPeak = idDonorPeaks[i]; @@ -803,49 
+993,91 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) RtInfo rtInfo = PredictRetentionTime(rtCalibrationCurve, donorPeak, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); if (rtInfo == null) continue; - FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, matchBetweenRunsIdentifiedPeaksThreadSpecific); - } + // Look for MBR target (predicted-RT peak) + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out var bestAcceptor); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestAcceptor, donorPeak.Identifications.First()); + + //Draw a random donor that has an rt sufficiently far enough away + double minimumRtDifference = rtInfo.Width*2; + ChromatographicPeak randomDonor = GetRandomPeak(donorPeaksMassOrdered, + donorPeak.ApexRetentionTime, + minimumRtDifference, + donorPeak.Identifications.First()); + + // Look for MBR decoy (random-RT peak) + ChromatographicPeak bestDecoy = null; + RtInfo decoyRtInfo = null; + if (randomDonor != null) + { + decoyRtInfo = PredictRetentionTime(rtCalibrationCurve, randomDonor, idAcceptorFile, acceptorSampleIsFractionated, donorSampleIsFractionated); + if (decoyRtInfo != null) + { + //Find a decoy peak using the randomly drawn retention time + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestDecoy, + randomRt: decoyRtInfo.PredictedRt); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestDecoy, donorPeak.Identifications.First()); + } + } - lock (matchBetweenRunsIdentifiedPeaks) - { - foreach (var kvp in matchBetweenRunsIdentifiedPeaksThreadSpecific) + double windowWidth = Math.Max(0.5, rtInfo.Width); + // If the search turned up empty, try again with a wider search window + while (bestAcceptor == null && bestDecoy == null) { - if (matchBetweenRunsIdentifiedPeaks.TryGetValue(kvp.Key, out var list)) + windowWidth = Math.Min(windowWidth, MbrRtWindow); + rtInfo.Width = windowWidth; + 
FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestAcceptor); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestAcceptor, donorPeak.Identifications.First()); + + if(decoyRtInfo != null) + { + decoyRtInfo.Width = windowWidth; + FindAllAcceptorPeaks(idAcceptorFile, scorer, rtInfo, mbrTol, donorPeak, out bestDecoy, + randomRt: decoyRtInfo.PredictedRt); + AddPeakToConcurrentDict(matchBetweenRunsIdentifiedPeaks, bestDecoy, donorPeak.Identifications.First()); + } + if (windowWidth >= MbrRtWindow) { - foreach (var peak in kvp.Value) - { - if (list.TryGetValue(peak.Key, out List existing)) - { - foreach (var acceptorPeak in peak.Value) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.Identifications.Add(acceptorPeak.Identifications.First()); - } - else - { - existing.Add(acceptorPeak); - } - } - } - else - { - list.Add(peak.Key, peak.Value); - } - } + break; } else { - matchBetweenRunsIdentifiedPeaks.Add(kvp.Key, kvp.Value); + windowWidth += 0.5; } } + } }); } + // Eliminate duplicate peaks (not sure where they come from) + foreach (var seqDictionaryKvp in matchBetweenRunsIdentifiedPeaks) + { + // Each isotopic envelope is linked to a list of ChromatographicPeaks + // Here, we remove instances where the same envelope is associated with multiple chromatographic peaks but the peaks correspond to the same donor peptide + // I don't know why this happens lol + // If multiple peaks are associated with the same envelope, and they have different associated peptide identifications, then they're kept separate. 
+ foreach (var envelopePeakListKvp in seqDictionaryKvp.Value) + { + List bestPeaks = new(); + foreach (var peakGroup in envelopePeakListKvp.Value.GroupBy(peak => peak.Identifications.First().ModifiedSequence)) + { + bestPeaks.Add(peakGroup.MaxBy(peak => peak.MbrScore)); + } + envelopePeakListKvp.Value.Clear(); + envelopePeakListKvp.Value.AddRange(bestPeaks); + } + } + + // Create a dictionary that stores imsPeak associated with an ms/ms identified peptide + Dictionary> msmsImsPeaks = _results.Peaks[idAcceptorFile] + .Where(peak => + !peak.DecoyPeptide + && peak.Apex?.IndexedPeak != null + && PeptideModifiedSequencesToQuantify.Contains(peak.Identifications.First().ModifiedSequence)) + .Select(peak => peak.Apex.IndexedPeak) + .GroupBy(imsPeak => imsPeak.ZeroBasedMs1ScanIndex) + .ToDictionary(g => g.Key, g => g.ToList()); + // take the best result (highest scoring) for each peptide after we've matched from all the donor files foreach (var mbrIdentifiedPeptide in matchBetweenRunsIdentifiedPeaks.Where(p => !acceptorFileIdentifiedSequences.Contains(p.Key))) { @@ -855,36 +1087,101 @@ private void QuantifyMatchBetweenRunsPeaks(SpectraFileInfo idAcceptorFile) continue; } - List peakHypotheses = mbrIdentifiedPeptide.Value.SelectMany(p => p.Value).OrderByDescending(p => p.MbrScore).ToList(); - - ChromatographicPeak best = peakHypotheses.First(); - - peakHypotheses.Remove(best); - - if (peakHypotheses.Count > 0) + foreach (var peakHypothesisGroup in mbrIdentifiedPeptide.Value.SelectMany(kvp => kvp.Value).OrderByDescending(p => p.MbrScore).GroupBy(p => p.RandomRt)) { - double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); - double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); + var peakHypotheses = peakHypothesisGroup.ToList(); + ChromatographicPeak best = peakHypotheses.First(); + peakHypotheses.Remove(best); - List peaksToRemoveFromHypotheses = new List(); - foreach (ChromatographicPeak peak in peakHypotheses.Where(p => 
p.Apex.ChargeState != best.Apex.ChargeState)) + // Discard any peaks that are already associated with an ms/ms identified peptide + while (best?.Apex?.IndexedPeak != null && msmsImsPeaks.TryGetValue(best.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList)) { - if (peak.Apex.IndexedPeak.RetentionTime > start && peak.Apex.IndexedPeak.RetentionTime < end) + if (peakList.Contains(best.Apex.IndexedPeak)) + { + if (!peakHypotheses.Any()) + { + best = null; + break; + } + best = peakHypotheses.First(); + peakHypotheses.Remove(best); + } + else { - best.MergeFeatureWith(peak, Integrate); + break; + } + } + if (best == null) continue; - peaksToRemoveFromHypotheses.Add(peak); + // merge peaks with different charge states + if (peakHypotheses.Count > 0) + { + double start = best.IsotopicEnvelopes.Min(p => p.IndexedPeak.RetentionTime); + double end = best.IsotopicEnvelopes.Max(p => p.IndexedPeak.RetentionTime); + + _results.Peaks[idAcceptorFile].Add(best); + foreach (ChromatographicPeak peak in peakHypotheses.Where(p => p.Apex.ChargeState != best.Apex.ChargeState)) + { + if (peak.Apex.IndexedPeak.RetentionTime >= start + && peak.Apex.IndexedPeak.RetentionTime <= end) + //&& Math.Abs(peak.MbrScore - best.MbrScore) / best.MbrScore < 0.25)// 25% difference is a rough heuristic, but I don't want super shitty peaks being able to supercede the intensity of a good peak! + { + if (msmsImsPeaks.TryGetValue(peak.Apex.IndexedPeak.ZeroBasedMs1ScanIndex, out var peakList) && peakList.Contains(peak.Apex.IndexedPeak)) + { + continue; // If the peak is already accounted for, skip it. + } + else + { + best.MergeFeatureWith(peak, Integrate); + } + } } } + _results.Peaks[idAcceptorFile].Add(best); } - - _results.Peaks[idAcceptorFile].Add(best); } - RunErrorChecking(idAcceptorFile); } + /// + /// A concurrent dictionary is used to keep track of MBR peaks that have been identified in the acceptor file. This function updates that dictionary + /// + /// concurrent dictionary. 
Key = Peptide sequence. Value = ConcurrentDictionary mapping where keys are isotopic envelopes and values are lists of associated peaks + /// Peak to add to the dictionary + /// The donor ID associated with the MBR peaks + private void AddPeakToConcurrentDict(ConcurrentDictionary>> matchBetweenRunsIdentifiedPeaks, + ChromatographicPeak peakToSave, + Identification donorIdentification) + { + if(peakToSave == null) + { + return; + } + // save the peak hypothesis + matchBetweenRunsIdentifiedPeaks.AddOrUpdate + ( + // new key + key: donorIdentification.ModifiedSequence, + // if we are adding a value for the first time, we simply create a new dictionary with one entry + addValueFactory: (sequenceKey) => + new ConcurrentDictionary>( + new Dictionary> + { + { peakToSave.Apex, new List { peakToSave } } + }), + // if the key (sequence) already exists, we have to add the new peak to the existing dictionary + updateValueFactory: (sequenceKey, envelopePeakListDict) => + { + envelopePeakListDict.AddOrUpdate( + key: peakToSave.Apex, + addValueFactory: (envelopeKey) => new List { peakToSave }, // if the key (envelope) doesn't exist, just create a new list + updateValueFactory: (envelopeKey, peakList) => { peakList.Add(peakToSave); return peakList; }); // if the key (envelope) already exists, add the peak to the associated list + return envelopePeakListDict; + } + ); + } + /// /// Finds MBR acceptor peaks by looping through every possible peak for every possible charge state /// in a given retention time range. Identified peaks are added to the matchBetweenRunsIdentifiedPeaks dictionary. @@ -900,21 +1197,24 @@ internal void FindAllAcceptorPeaks( RtInfo rtInfo, Tolerance fileSpecificTol, ChromatographicPeak donorPeak, - Dictionary>> matchBetweenRunsIdentifiedPeaksThreadSpecific) + out ChromatographicPeak bestAcceptor, + double? 
randomRt = null) { // get the MS1 scan info for this region so we can look up indexed peaks Ms1ScanInfo[] ms1ScanInfos = _ms1Scans[idAcceptorFile]; Ms1ScanInfo start = ms1ScanInfos[0]; Ms1ScanInfo end = ms1ScanInfos[ms1ScanInfos.Length - 1]; + double rtStartHypothesis = randomRt == null ? rtInfo.RtStartHypothesis : (double)randomRt - (rtInfo.Width / 2.0); + double rtEndHypothesis = randomRt == null ? rtInfo.RtEndHypothesis : (double)randomRt + (rtInfo.Width / 2.0); for (int j = 0; j < ms1ScanInfos.Length; j++) { Ms1ScanInfo scan = ms1ScanInfos[j]; - if (scan.RetentionTime <= rtInfo.RtStartHypothesis) + if (scan.RetentionTime <= rtStartHypothesis) { start = scan; } - if (scan.RetentionTime >= rtInfo.RtEndHypothesis) + if (scan.RetentionTime >= rtEndHypothesis) { end = scan; break; @@ -930,6 +1230,7 @@ internal void FindAllAcceptorPeaks( } Identification donorIdentification = donorPeak.Identifications.First(); + bestAcceptor = null; foreach (int z in chargesToMatch) { @@ -951,36 +1252,13 @@ internal void FindAllAcceptorPeaks( while (chargeEnvelopes.Any()) { ChromatographicPeak acceptorPeak = FindIndividualAcceptorPeak(idAcceptorFile, scorer, donorPeak, - fileSpecificTol, rtInfo, z, chargeEnvelopes); + fileSpecificTol, rtInfo, z, chargeEnvelopes, randomRt); if (acceptorPeak == null) - continue; - - // save the peak hypothesis - if (matchBetweenRunsIdentifiedPeaksThreadSpecific.TryGetValue(donorIdentification.ModifiedSequence, out var mbrPeaks)) - { - if (mbrPeaks.TryGetValue(acceptorPeak.Apex, out List existing)) - { - var samePeakSameSequence = existing - .FirstOrDefault(p => p.Identifications.First().ModifiedSequence == acceptorPeak.Identifications.First().ModifiedSequence); - - if (samePeakSameSequence != null) - { - samePeakSameSequence.Identifications.Add(donorIdentification); - } - else - { - existing.Add(acceptorPeak); - } - } - else - { - mbrPeaks.Add(acceptorPeak.Apex, new List { acceptorPeak }); - } - } - else + continue; + if (bestAcceptor == null || 
bestAcceptor.MbrScore < acceptorPeak.MbrScore) { - matchBetweenRunsIdentifiedPeaksThreadSpecific.Add(donorIdentification.ModifiedSequence, new Dictionary>()); - matchBetweenRunsIdentifiedPeaksThreadSpecific[donorIdentification.ModifiedSequence].Add(acceptorPeak.Apex, new List { acceptorPeak }); + acceptorPeak.ChargeList = chargesToMatch; + bestAcceptor = acceptorPeak; } } } @@ -1004,10 +1282,11 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( Tolerance mbrTol, RtInfo rtInfo, int z, - List chargeEnvelopes) + List chargeEnvelopes, + double? randomRt = null) { - var donorId = donorPeak.Identifications.First(); - var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, predictedRetentionTime: rtInfo.PredictedRt); + var donorId = donorPeak.Identifications.OrderBy(p => p.QValue).First(); + var acceptorPeak = new ChromatographicPeak(donorId, true, idAcceptorFile, randomRt != null); // Grab the first scan/envelope from charge envelopes. This should be the most intense envelope in the list IsotopicEnvelope seedEnv = chargeEnvelopes.First(); @@ -1030,7 +1309,7 @@ internal ChromatographicPeak FindIndividualAcceptorPeak( return null; } - acceptorPeak.CalculateMbrScore(scorer, donorPeak); + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, randomRt ?? 
rtInfo.PredictedRt); return acceptorPeak; } @@ -1076,9 +1355,9 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) { if (!tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - if (PeptidesModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence)) + if (PeptideModifiedSequencesToQuantify.Contains(tryPeak.Identifications.First().ModifiedSequence)) { - if (PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + if (PeptideModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) { storedPeak.MergeFeatureWith(tryPeak, Integrate); } @@ -1096,14 +1375,20 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) } else if (tryPeak.IsMbrPeak && !storedPeak.IsMbrPeak) { - if(PeptidesModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) + // Default to MSMS peaks over MBR Peaks. + // Most of these have already been eliminated + // However, sometimes merging MBR peaks with different charge states reveals that + // The MBR peak conflicts with an MSMS peak + // Removing the peak when this happens is a conservative step. + // Sometimes the MSMS peak is a decoy, or has a peptides level Q-value < 0.01 (i.e., the modified sequence isn't in PeptideModifiedSequencesToQuantify). + // In this case, we keep the MBR peak. 
+ if (storedPeak.DecoyPeptide || !PeptideModifiedSequencesToQuantify.Contains(storedPeak.Identifications.First().ModifiedSequence)) { - continue; + errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; } else { - // If the stored peak id isn't in the list of peptides to quantify, overwrite it - errorCheckedPeaksGroupedByApex[tryPeak.Apex.IndexedPeak] = tryPeak; + continue; } } else if (tryPeak.IsMbrPeak && storedPeak.IsMbrPeak) @@ -1129,6 +1414,140 @@ private void RunErrorChecking(SpectraFileInfo spectraFile) _results.Peaks[spectraFile] = errorCheckedPeaks; } + private bool RunPEPAnalysis() + { + List mbrPeaks = _results.Peaks.SelectMany(kvp => kvp.Value) + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .ToList(); + + if (!mbrPeaks.IsNotNullOrEmpty()) return false; + int decoyPeakTotal = mbrPeaks.Count(peak => peak.RandomRt); + + List tempPepQs = new(); + List tempQs = new(); + if (mbrPeaks.Count > 100 && decoyPeakTotal > 20) + { + PepAnalysisEngine pepAnalysisEngine = new PepAnalysisEngine(mbrPeaks, + outputFolder: Path.GetDirectoryName(_spectraFileInfo.First().FullFilePathWithExtension), + maxThreads: MaxThreads, + pepTrainingFraction: PepTrainingFraction); + var pepOutput = pepAnalysisEngine.ComputePEPValuesForAllPeaks(); + + _results.PepResultString = pepOutput; + + return true; + } + return false; + } + + /// + /// Calculates the FDR for each MBR-detected peak using decoy peaks and decoy peptides, + /// Then filters out all peaks below a given FDR threshold + /// + private void CalculateFdrForMbrPeaks(SpectraFileInfo acceptorFile, bool usePep) + { + List mbrPeaks; + if (usePep) + { + // Take only the top scoring acceptor for each donor (acceptor can be target or decoy!) 
+ // Maybe we're sorting twice when we don't have to but idk if order is preserved using group by + mbrPeaks = _results.Peaks[acceptorFile] + .Where(peak => peak.IsMbrPeak) + .GroupBy(peak => peak.Identifications.First()) + .Select(group => group.OrderBy(peak => peak.MbrPep).ThenByDescending(peak => peak.MbrScore).First()) + .OrderBy(peak => peak.MbrPep) + .ThenByDescending(peak => peak.MbrScore) + .ToList(); + + _results.Peaks[acceptorFile] = mbrPeaks.Concat(_results.Peaks[acceptorFile].Where(peak => !peak.IsMbrPeak)).ToList(); + } + else + { + // If PEP wasn't performed, things probably aren't calibrated very well, and so it's better + // To err on the safe side and not remove the decoys + mbrPeaks = _results.Peaks[acceptorFile] + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .ToList(); + } + + if (!mbrPeaks.IsNotNullOrEmpty()) return; + + List tempQs = new(); + int totalPeaks = 0; + int decoyPeptides = 0; + int decoyPeaks = 0; + int doubleDecoys = 0; + for (int i = 0; i < mbrPeaks.Count; i++) + { + totalPeaks++; + switch (mbrPeaks[i]) + { + case ChromatographicPeak p when (!p.DecoyPeptide && !p.RandomRt): + break; + case ChromatographicPeak p when (p.DecoyPeptide && !p.RandomRt): + decoyPeptides++; + break; + case ChromatographicPeak p when (!p.DecoyPeptide && p.RandomRt): + decoyPeaks++; + break; + case ChromatographicPeak p when (p.DecoyPeptide && p.RandomRt): + doubleDecoys++; + break; + } + + // There are two parts to this score. We're summing the PEPs of peaks derived from target peptides. 
For peaks derived from decoy peptides, + // We do the double decoy things where we count decoyPeptidePeaks - doubleDecoypeaks + tempQs.Add(Math.Round(EstimateFdr(doubleDecoys, decoyPeptides, decoyPeaks, totalPeaks), 6)); + } + + // Set the q-value for each peak + double[] correctedQs = CorrectQValues(tempQs); + for (int i = 0; i < correctedQs.Length; i++) + { + mbrPeaks[i].MbrQValue = correctedQs[i]; + } + } + + private int EstimateDecoyPeptideErrors(int decoyPeptideCount, int doubleDecoyCount) + { + return Math.Max(0, decoyPeptideCount - doubleDecoyCount); + } + + private double EstimateFdr(int doubleDecoyCount, int decoyPeptideCount, int decoyPeakCount, int totalPeakCount) + { + return (double)(1 + decoyPeakCount + EstimateDecoyPeptideErrors(decoyPeptideCount, doubleDecoyCount)) / totalPeakCount; + } + + /// + /// Standard q-value correction, ensures that in a list of temporary q-values, a q-value is equal to + /// Min(q-values, every q-value below in the list). As you work your way down a list of q-values, the value should only increase or stay the same. + /// + /// + /// + private double[] CorrectQValues(List tempQs) + { + if (!tempQs.IsNotNullOrEmpty()) return null; + double[] correctedQValues = new double[tempQs.Count]; + correctedQValues[tempQs.Count - 1] = tempQs.Last(); + for(int i = tempQs.Count-2; i >=0; i--) + { + if (tempQs[i] > correctedQValues[i+1]) + { + correctedQValues[i] = correctedQValues[i + 1]; + } + else + { + correctedQValues[i] = tempQs[i]; + } + } + + return correctedQValues; + } + + #endregion + /// /// Takes in a list of imsPeaks and finds all the isotopic peaks in each scan. If the experimental isotopic distribution /// matches the theoretical distribution, an IsotopicEnvelope object is created from the summed intensities of each isotopic peak. 
@@ -1228,7 +1647,7 @@ public List GetIsotopicEnvelopes( } // Check that the experimental envelope matches the theoretical - if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance)) + if (CheckIsotopicEnvelopeCorrelation(massShiftToIsotopePeaks, peak, chargeState, isotopeTolerance, out var pearsonCorr)) { // impute unobserved isotope peak intensities // TODO: Figure out why value imputation is performed. Build a toggle? @@ -1240,7 +1659,7 @@ public List GetIsotopicEnvelopes( } } - isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum())); + isotopicEnvelopes.Add(new IsotopicEnvelope(peak, chargeState, experimentalIsotopeIntensities.Sum(), pearsonCorr)); } } @@ -1261,9 +1680,10 @@ public bool CheckIsotopicEnvelopeCorrelation( Dictionary> massShiftToIsotopePeaks, IndexedMassSpectralPeak peak, int chargeState, - Tolerance isotopeTolerance) + Tolerance isotopeTolerance, + out double pearsonCorrelation) { - double pearsonCorrelation = Correlation.Pearson( + pearsonCorrelation = Correlation.Pearson( massShiftToIsotopePeaks[0].Select(p => p.expIntensity), massShiftToIsotopePeaks[0].Select(p => p.theorIntensity)); diff --git a/mzLib/FlashLFQ/Identification.cs b/mzLib/FlashLFQ/Identification.cs index 85c557c3e..2ee9bd1b4 100644 --- a/mzLib/FlashLFQ/Identification.cs +++ b/mzLib/FlashLFQ/Identification.cs @@ -15,11 +15,15 @@ public class Identification public readonly ChemicalFormula OptionalChemicalFormula; public readonly bool UseForProteinQuant; public double PeakfindingMass; + public double PsmScore { get; init; } + public double QValue { get; init; } + public bool IsDecoy { get; } public Identification(SpectraFileInfo fileInfo, string BaseSequence, string ModifiedSequence, double monoisotopicMass, double ms2RetentionTimeInMinutes, int chargeState, List proteinGroups, - ChemicalFormula optionalChemicalFormula = null, bool useForProteinQuant = true) + ChemicalFormula 
optionalChemicalFormula = null, bool useForProteinQuant = true, + double psmScore = 0, double qValue = 0, bool decoy = false) { this.FileInfo = fileInfo; this.BaseSequence = BaseSequence; @@ -29,7 +33,10 @@ public Identification(SpectraFileInfo fileInfo, string BaseSequence, string Modi this.PrecursorChargeState = chargeState; this.ProteinGroups = new HashSet(proteinGroups); this.OptionalChemicalFormula = optionalChemicalFormula; - UseForProteinQuant = useForProteinQuant; + UseForProteinQuant = !decoy && useForProteinQuant; // ensure that decoy peptides aren't used for protein quant + QValue = qValue; + PsmScore = psmScore; + IsDecoy = decoy; } public override string ToString() diff --git a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs index c9aa89042..cdadc56eb 100644 --- a/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs +++ b/mzLib/FlashLFQ/IndexedMassSpectralPeak.cs @@ -29,7 +29,7 @@ public override bool Equals(object obj) public override int GetHashCode() { - return Mz.GetHashCode(); + return HashCode.Combine(Mz, ZeroBasedMs1ScanIndex); } public override string ToString() diff --git a/mzLib/FlashLFQ/IsotopicEnvelope.cs b/mzLib/FlashLFQ/IsotopicEnvelope.cs index 09d7207d7..938ac7850 100644 --- a/mzLib/FlashLFQ/IsotopicEnvelope.cs +++ b/mzLib/FlashLFQ/IsotopicEnvelope.cs @@ -11,11 +11,12 @@ public class IsotopicEnvelope public readonly IndexedMassSpectralPeak IndexedPeak; public readonly int ChargeState; - public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity) + public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeState, double intensity, double pearsonCorrelation) { IndexedPeak = monoisotopicPeak; ChargeState = chargeState; Intensity = intensity / chargeState; + PearsonCorrelation = pearsonCorrelation; } /// @@ -25,6 +26,9 @@ public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeStat /// public double Intensity { get; private set; } + + 
public double PearsonCorrelation { get; init; } + public void Normalize(double normalizationFactor) { Intensity *= normalizationFactor; diff --git a/mzLib/FlashLFQ/MbrScorer.cs b/mzLib/FlashLFQ/MbrScorer.cs index a703cf3b7..72c2ee72d 100644 --- a/mzLib/FlashLFQ/MbrScorer.cs +++ b/mzLib/FlashLFQ/MbrScorer.cs @@ -1,8 +1,10 @@ -using MathNet.Numerics.Distributions; +using Easy.Common.EasyComparer; +using MathNet.Numerics.Distributions; using MathNet.Numerics.Statistics; using System; using System.Collections.Generic; using System.Data; +using System.Data.Entity.ModelConfiguration.Conventions; using System.Linq; namespace FlashLFQ @@ -13,16 +15,18 @@ namespace FlashLFQ /// internal class MbrScorer { - internal SpectraFileInfo AcceptorFile { get; init; } // Intensity and ppm distributions are specific to each acceptor file private readonly Normal _logIntensityDistribution; private readonly Normal _ppmDistribution; private readonly Normal _scanCountDistribution; + private readonly Gamma _isotopicCorrelationDistribution; // The logFcDistributions and rtDifference distributions are unique to each donor file - acceptor file pair private Dictionary _logFcDistributionDictionary; private Dictionary _rtPredictionErrorDistributionDictionary; + internal Dictionary ApexToAcceptorFilePeakDict { get; } - internal List UnambiguousMsMsPeaks { get; } + internal List UnambiguousMsMsAcceptorPeaks { get; } + internal double MaxNumberOfScansObserved { get; } /// /// Takes in an intensity distribution, a log foldchange distribution, and a ppm distribution @@ -30,32 +34,47 @@ internal class MbrScorer /// internal MbrScorer( Dictionary apexToAcceptorFilePeakDict, - List acceptorPeaks, + List acceptorFileMsmsPeaks, Normal ppmDistribution, Normal logIntensityDistribution) { - AcceptorFile = acceptorPeaks.First().SpectraFileInfo; ApexToAcceptorFilePeakDict = apexToAcceptorFilePeakDict; - UnambiguousMsMsPeaks = acceptorPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && 
p.NumIdentificationsByFullSeq == 1).ToList(); + UnambiguousMsMsAcceptorPeaks = acceptorFileMsmsPeaks.Where(p => p.Apex != null && !p.IsMbrPeak && p.NumIdentificationsByFullSeq == 1).ToList(); + MaxNumberOfScansObserved = acceptorFileMsmsPeaks.Max(peak => peak.ScanCount); _logIntensityDistribution = logIntensityDistribution; _ppmDistribution = ppmDistribution; + _isotopicCorrelationDistribution = GetIsotopicEnvelopeCorrDistribution(); _logFcDistributionDictionary = new(); _rtPredictionErrorDistributionDictionary = new(); // This is kludgey, because scan counts are discrete - List scanList = acceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); + List scanList = UnambiguousMsMsAcceptorPeaks.Select(peak => (double)peak.ScanCount).ToList(); // build a normal distribution for the scan list of the acceptor peaks - // InterQuartileRange / 1.35 = StandardDeviation for a normal distribution _scanCountDistribution = new Normal(scanList.Average(), scanList.Count > 30 ? scanList.StandardDeviation() : scanList.InterquartileRange() / 1.36); } + /// + /// This distribution represents (1 - Pearson Correlation) for isotopic envelopes of MS/MS acceptor peaks + /// + /// + private Gamma GetIsotopicEnvelopeCorrDistribution() + { + var pearsonCorrs = UnambiguousMsMsAcceptorPeaks.Select(p => 1 - p.IsotopicPearsonCorrelation).Where(p => p > 0).ToList(); + if (pearsonCorrs.Count <= 1) return null; + double mean = pearsonCorrs.Mean(); + double variance = pearsonCorrs.Variance(); + var alpha = Math.Pow(mean, 2) / variance; + var beta = mean / variance; + return new Gamma(alpha, beta); + } + /// /// Takes in a list of retention time differences for anchor peptides (donor RT - acceptor RT) and uses /// this list to calculate the distribution of prediction errors of the local RT alignment strategy employed by /// match-between-runs for the specified donor file /// /// List of retention time differences (doubles) calculated as donor file RT - acceptor file RT - internal void 
AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs, int numberOfAnchorPeptidesPerSide) + internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List anchorPeptideRtDiffs, int numberOfAnchorPeptides) { // in MBR, we use anchor peptides on either side of the donor to predict the retention time // here, we're going to repeat the same process, using neighboring anchor peptides to predicte the Rt shift for each @@ -66,40 +85,88 @@ internal void AddRtPredErrorDistribution(SpectraFileInfo donorFile, List double cumSumRtDiffs; List rtPredictionErrors = new(); - for (int i = numberOfAnchorPeptidesPerSide; i < anchorPeptideRtDiffs.Count - (numberOfAnchorPeptidesPerSide) ; i++) + for (int i = numberOfAnchorPeptides; i < (anchorPeptideRtDiffs.Count - numberOfAnchorPeptides); i++) { cumSumRtDiffs = 0; - for(int j = 1; j <= numberOfAnchorPeptidesPerSide; j++) + for(int j = 1; j <= numberOfAnchorPeptides; j++) { cumSumRtDiffs += anchorPeptideRtDiffs[i - j]; cumSumRtDiffs += anchorPeptideRtDiffs[i + j]; } - double avgDiff = cumSumRtDiffs / (2 * numberOfAnchorPeptidesPerSide); + double avgDiff = cumSumRtDiffs / (2 * numberOfAnchorPeptides); rtPredictionErrors.Add(avgDiff - anchorPeptideRtDiffs[i]); } - if(!rtPredictionErrors.Any()) + Normal rtPredictionErrorDist = new Normal(0, 0); + // Default distribution. 
Effectively assigns a RT Score of zero if no alignment can be performed + // between the donor and acceptor based on shared MS/MS IDs + + if(rtPredictionErrors.Any()) { - _rtPredictionErrorDistributionDictionary.Add(donorFile, new Normal(0, 1)); - return; + double medianRtError = rtPredictionErrors.Median(); + double stdDevRtError = rtPredictionErrors.StandardDeviation(); + if(stdDevRtError >= 0.0 && !double.IsNaN(medianRtError)) + { + rtPredictionErrorDist = new Normal(medianRtError, 1); + } } + + _rtPredictionErrorDistributionDictionary.Add(donorFile, rtPredictionErrorDist); + } - double medianRtError = rtPredictionErrors.Median(); - double stdDevRtError = rtPredictionErrors.StandardDeviation(); - - _rtPredictionErrorDistributionDictionary.Add(donorFile, new Normal(medianRtError, stdDevRtError)); + /// + /// Scores an MBR acceptor peak against its donor peak. Sets the intensity, retention time, ppm error, scan count, and + /// isotopic distribution scores on the acceptor peak (the RT score uses the prediction-error distribution stored for the donor peak's file), + /// then combines those component scores into a single MBR score + /// + /// An MBR Score ranging between 0 and 100. Higher scores are better. 
+ internal double ScoreMbr(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak, double predictedRt) + { + acceptorPeak.IntensityScore = CalculateIntensityScore(acceptorPeak.Intensity, donorPeak); + acceptorPeak.RtPredictionError = predictedRt - acceptorPeak.ApexRetentionTime; + acceptorPeak.RtScore = CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], + acceptorPeak.RtPredictionError); + acceptorPeak.PpmScore = CalculateScore(_ppmDistribution, acceptorPeak.MassError); + acceptorPeak.ScanCountScore = CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); + acceptorPeak.IsotopicDistributionScore = CalculateScore(_isotopicCorrelationDistribution, 1 - acceptorPeak.IsotopicPearsonCorrelation); + + // Returns 100 times the geometric mean of the five scores (intensity score, rt score, ppm score, scan count score, isotopic distribution score) + return 100 * Math.Pow(acceptorPeak.IntensityScore + * acceptorPeak.RtScore + * acceptorPeak.PpmScore + * acceptorPeak.ScanCountScore + * acceptorPeak.IsotopicDistributionScore, 0.20); } - private double CalculateScore(Normal distribution, double value) + // Setting a minimum score prevents the MBR score from going to zero if one component of that score is 0 + // 3e-7 is the fraction of a normal distribution that lies at least 5 stdDev away from the mean + private double _minScore = 3e-7; + + internal double CalculateScore(Normal distribution, double value) { + // new method double absoluteDiffFromMean = Math.Abs(distribution.Mean - value); // Returns a value between (0, 1] where 1 means the value was equal to the distribution mean - return 2 * distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); + // The score represents the fraction of the distribution that lies absoluteDiffFromMean away from the mean or further + // i.e., what fraction of the distribution is more extreme than value + double score = 2 * distribution.CumulativeDistribution(distribution.Mean - absoluteDiffFromMean); + 
return (double.IsNaN(score) || score == 0) ? _minScore : score; } - internal double CalculateIntensityScore(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) + internal double CalculateScore(Gamma distribution, double value) + { + if (value < 0 || distribution == null) + { + return _minScore; + } + + // For the gamma distribution, the CDF is 0 when the pearson correlation is equal to 1 (value = 0) + // The CDF then rapidly rises, reaching ~1 at a value of 0.3 (corresponding to a pearson correlation of 0.7) + return 1 - distribution.CumulativeDistribution(value); + } + + internal double CalculateIntensityScore(double acceptorIntensity, ChromatographicPeak donorPeak) { - double acceptorIntensity = acceptorPeak.Intensity; if (donorPeak != null && acceptorIntensity != 0 && donorPeak.Intensity != 0 && _logFcDistributionDictionary.TryGetValue(donorPeak.SpectraFileInfo, out var logFcDistribution)) { @@ -111,38 +178,7 @@ internal double CalculateIntensityScore(ChromatographicPeak acceptorPeak, Chroma var logIntensity = Math.Log(acceptorIntensity, 2); return CalculateScore(_logIntensityDistribution, logIntensity); } - } - /// - /// Calculates the retention time score for a given MbrAcceptor by comparing to the - /// distribution of all retention time prediction errors for all anchor peptides shared between - /// the donor and acceptor files - /// - /// Score bounded by 0 and 1, where higher scores are better - internal double CalculateRetentionTimeScore(ChromatographicPeak acceptorPeak, ChromatographicPeak donorPeak) - { - double rtPredictionError = acceptorPeak.PredictedRetentionTime - acceptorPeak.ApexRetentionTime; - return CalculateScore(_rtPredictionErrorDistributionDictionary[donorPeak.SpectraFileInfo], rtPredictionError); - } - - /// - /// Calculates the Ppm error score for a given acceptor by comparing the ppm error for the given peak - /// to the ppm error of all non-MBR peaks in the acceptor file - /// - /// Score bounded by 0 and 1, where higher 
scores are better - internal double CalculatePpmErrorScore(ChromatographicPeak acceptorPeak) - { - return CalculateScore(_ppmDistribution, acceptorPeak.MassError); - } - - /// - /// Calculates the scan count score for a given acceptor by comparing the number of scans observed for the given peak - /// to the ppm error of all non-MBR peaks in the acceptor file - /// - /// Score bounded by 0 and 1, where higher scores are better - internal double CalculateScanCountScore(ChromatographicPeak acceptorPeak) - { - return CalculateScore(_scanCountDistribution, acceptorPeak.ScanCount); } /// @@ -162,7 +198,7 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP var acceptorFileBestMsmsPeaks = new Dictionary(); // get the best (most intense) peak for each peptide in the acceptor file - foreach (ChromatographicPeak acceptorPeak in UnambiguousMsMsPeaks) + foreach (ChromatographicPeak acceptorPeak in UnambiguousMsMsAcceptorPeaks) { if (acceptorFileBestMsmsPeaks.TryGetValue(acceptorPeak.Identifications.First().ModifiedSequence, out ChromatographicPeak currentBestPeak)) { @@ -199,5 +235,18 @@ internal void CalculateFoldChangeBetweenFiles(List idDonorP } } + /// + /// Determines whether or not the scorer is validly parameterized and capable + /// of scoring MBR transfers originating from the given donorFile + /// + internal bool IsValid(SpectraFileInfo donorFile) + { + return _rtPredictionErrorDistributionDictionary.TryGetValue(donorFile, out var rtDist) + && rtDist != null + && _ppmDistribution != null + && _scanCountDistribution != null + && _logIntensityDistribution != null; + } + } } diff --git a/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs b/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs new file mode 100644 index 000000000..fbb8f429d --- /dev/null +++ b/mzLib/FlashLFQ/PEP/ChromatographicPeakData.cs @@ -0,0 +1,106 @@ +using Easy.Common.Extensions; +using Microsoft.ML.Data; +using System.Collections.Generic; +using System.Collections.Immutable; +using System.Text; + 
+namespace FlashLFQ.PEP +{ + public class ChromatographicPeakData + { + public static readonly IImmutableDictionary trainingInfos = new Dictionary + { + { "standard", new [] + { + "PpmErrorScore", + "IntensityScore", + "RtScore", + "ScanCountScore", + "IsotopicDistributionScore", + "PpmErrorRaw", + "IntensityRaw", + "RtPredictionErrorRaw", + "ScanCountRaw", + "IsotopicPearsonCorrelation" + } + }, + { "reduced", new [] + { + "PpmErrorRaw", + "IntensityRaw", + "RtPredictionErrorRaw", + "ScanCountRaw", + "IsotopicPearsonCorrelation" + } + }, + }.ToImmutableDictionary(); + + /// + /// These are used for percolator. Trainer must be told the assumed direction for each attribute as it relates to being a true positive + /// Here, a weight of 1 indicates that the probability of being true is for higher numbers in the set. + /// A weight of -1 indicates that the probability of being true is for the lower numbers in the set. + /// + public static readonly IImmutableDictionary assumedAttributeDirection = new Dictionary { + { "PpmErrorScore", 1 }, + { "IntensityScore", 1 }, + { "RtScore", 1 }, + { "ScanCountScore", 1 }, + { "IsotopicDistributionScore", 1 }, + { "PpmErrorRaw", -1 }, + { "IntensityRaw", 1 }, + { "RtPredictionErrorRaw", -1 }, + { "ScanCountRaw", -1 }, + { "IsotopicPearsonCorrelation", 1 } + }.ToImmutableDictionary(); + + public string ToString(string searchType) + { + StringBuilder sb = new StringBuilder(); + var variablesToOutput = ChromatographicPeakData.trainingInfos[searchType]; + + foreach (var variable in variablesToOutput) + { + var property = typeof(ChromatographicPeakData).GetProperty(variable).GetValue(this, null); + var floatValue = (float)property; + sb.Append("\t"); + sb.Append(floatValue.ToString()); + } + + return sb.ToString(); + } + + [LoadColumn(0)] + public float PpmErrorScore { get; set; } + + [LoadColumn(1)] + public float IntensityScore { get; set; } + + [LoadColumn(2)] + public float RtScore { get; set; } + + [LoadColumn(3)] + public float 
ScanCountScore { get; set; } + + [LoadColumn(4)] + public float IsotopicDistributionScore { get; set; } + + [LoadColumn(5)] + public float PpmErrorRaw { get; set; } + + [LoadColumn(6)] + public float IntensityRaw { get; set; } + + [LoadColumn(7)] + public float RtPredictionErrorRaw { get; set; } + + [LoadColumn(8)] + public float ScanCountRaw { get; set; } + + [LoadColumn(9)] + public float IsotopicPearsonCorrelation { get; set; } + + [LoadColumn(10)] + public bool Label { get; set; } + + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/PEP/DonorGroup.cs b/mzLib/FlashLFQ/PEP/DonorGroup.cs new file mode 100644 index 000000000..351bdee90 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/DonorGroup.cs @@ -0,0 +1,42 @@ +using System; +using System.Collections; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ.PEP +{ + /// + /// This class represents a group of chromatographic peaks that are associated with a donor identification. + /// During MBR, one donor identification is associated with multiple acceptor identifications, with both + /// predicted retention times (good MBR transfers) and random retention times (decoy MBR transfers). + /// This class groups them together for the purpose of cross-validation/PEP scoring + /// + public class DonorGroup : IEnumerable + { + public Identification DonorId { get; } + public List TargetAcceptors { get; } + public List DecoyAcceptors { get; } + + public DonorGroup(Identification donorId, List targetAcceptors, List decoyAcceptors) + { + DonorId = donorId; + TargetAcceptors = targetAcceptors; + DecoyAcceptors = decoyAcceptors; + } + + public double BestTargetMbrScore => TargetAcceptors.Count == 0 ? 
0 : TargetAcceptors.Max(acceptor => acceptor.MbrScore); + + public IEnumerator GetEnumerator() + { + return TargetAcceptors.Concat(DecoyAcceptors).GetEnumerator(); + } + + IEnumerator IEnumerable.GetEnumerator() + { + return GetEnumerator(); + } + + } +} diff --git a/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs b/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs new file mode 100644 index 000000000..eaddae3e8 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/PepAnalysisEngine.cs @@ -0,0 +1,647 @@ +using Microsoft.ML; +using Microsoft.ML.Data; +using Microsoft.ML.Trainers.FastTree; +using System; +using System.Collections.Concurrent; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Omics; +using System.Collections; +using System.Security.Policy; +using System.Text.RegularExpressions; +using System.Reflection; + +namespace FlashLFQ.PEP +{ + public class PepAnalysisEngine + { + public double PipScoreCutoff; + + private static int _randomSeed = 42; + + /// + /// This method contains the hyper-parameters that will be used when training the machine learning model + /// + /// Options object to be passed in to the FastTree constructor + public FastTreeBinaryTrainer.Options BGDTreeOptions => + new FastTreeBinaryTrainer.Options + { + NumberOfThreads = 1, + NumberOfTrees = 100, + MinimumExampleCountPerLeaf = 10, + NumberOfLeaves = 20, + LearningRate = 0.2, + LabelColumnName = "Label", + FeatureColumnName = "Features", + Seed = _randomSeed, + FeatureSelectionSeed = _randomSeed, + RandomStart = false, + UnbalancedSets = true + }; + + public List Peaks { get; } + public string OutputFolder { get; set; } + public int MaxThreads { get; set; } + public double PepTrainingFraction { get; set; } + + public PepAnalysisEngine(List peaks, string outputFolder, int maxThreads, double pepTrainingFraction = 0.25) + { + Peaks = peaks; + OutputFolder = outputFolder; + MaxThreads = maxThreads; + PepTrainingFraction = pepTrainingFraction; + 
} + + public string ComputePEPValuesForAllPeaks() + { + string[] trainingVariables = ChromatographicPeakData.trainingInfos["standard"]; + + #region Construct Donor Groups + // this is target peak not target peptide + List donors= new(); + foreach(var donorGroup in Peaks + .Where(peak => peak.IsMbrPeak) + .OrderByDescending(peak => peak.MbrScore) + .GroupBy(peak => peak.Identifications.First())) //Group by donor peptide + { + var donorId = donorGroup.Key; + var targetAcceptors = donorGroup.Where(peak => !peak.RandomRt).ToList(); + var decoyAcceptors = donorGroup.Where(peak => peak.RandomRt).ToList(); + donors.Add(new DonorGroup(donorId, targetAcceptors, decoyAcceptors)); + } + + // Fix the order + donors = OrderDonorGroups(donors); + + var peakScores = donors.SelectMany(donor => donor.Select(p => p.MbrScore)).OrderByDescending(score => score).ToList(); + PipScoreCutoff = peakScores[(int)Math.Floor(peakScores.Count * PepTrainingFraction)]; //Select the top N percent of all peaks, only use those as positive examples + + MLContext mlContext = new MLContext(_randomSeed); + //the number of groups used for cross-validation is hard-coded at three. Do not change this number without changing other areas of effected code. + const int numGroups = 3; + + List[] donorGroupIndices = GetDonorGroupIndices(donors, numGroups, PipScoreCutoff); + + #endregion + + #region Create Groups and Model + IEnumerable[] ChromatographicPeakDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + ChromatographicPeakDataGroups[i] = CreateChromatographicPeakData(donors, donorGroupIndices[i], MaxThreads); + + if (!ChromatographicPeakDataGroups[i].Any(p => p.Label == true) + || !ChromatographicPeakDataGroups[i].Any(p => p.Label == false)) + { + return "Posterior error probability analysis failed. 
This can occur for small data sets when some sample groups are missing positive or negative training examples."; + } + } + + TransformerChain>>[] trainedModels + = new TransformerChain>>[numGroups]; + + var trainer = mlContext.BinaryClassification.Trainers.FastTree(BGDTreeOptions); + var pipeline = mlContext.Transforms.Concatenate("Features", trainingVariables) + .Append(trainer); + + List allMetrics = new List(); + + #endregion + + #region Training and Cross Validation First iteration + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) + { + + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + //concat doesn't work in a loop, therefore I had to hard code the concat to group 3 out of 4 lists. if the const int numGroups value is changed, then the concat has to be changed accordingly. + IDataView dataView = mlContext.Data.LoadFromEnumerable( + ChromatographicPeakDataGroups[allGroupIndexes[0]] + .Concat(ChromatographicPeakDataGroups[allGroupIndexes[1]])); + + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(ChromatographicPeakDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + Compute_PEP_For_All_Peaks(donors, donorGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], OutputFolder, MaxThreads); + + allMetrics.Add(metrics); + } + 
+ #endregion + #region Iterative Training + + for(int trainingIteration = 0; trainingIteration < 9; trainingIteration++) + { + ChromatographicPeakDataGroups = new IEnumerable[numGroups]; + for (int i = 0; i < numGroups; i++) + { + ChromatographicPeakDataGroups[i] = CreateChromatographicPeakDataIteration(donors, donorGroupIndices[i], MaxThreads); + + if (!ChromatographicPeakDataGroups[i].Any(p => p.Label == true) + || !ChromatographicPeakDataGroups[i].Any(p => p.Label == false)) + { + return "Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples."; + } + } + + for (int groupIndexNumber = 0; groupIndexNumber < numGroups; groupIndexNumber++) + { + List allGroupIndexes = Enumerable.Range(0, numGroups).ToList(); + allGroupIndexes.RemoveAt(groupIndexNumber); + + IDataView dataView = mlContext.Data.LoadFromEnumerable( + ChromatographicPeakDataGroups[allGroupIndexes[0]] + .Concat(ChromatographicPeakDataGroups[allGroupIndexes[1]])); + + trainedModels[groupIndexNumber] = pipeline.Fit(dataView); + var myPredictions = trainedModels[groupIndexNumber].Transform(mlContext.Data.LoadFromEnumerable(ChromatographicPeakDataGroups[groupIndexNumber])); + CalibratedBinaryClassificationMetrics metrics = mlContext.BinaryClassification.Evaluate(data: myPredictions, labelColumnName: "Label", scoreColumnName: "Score"); + + //Parallel operation of the following code requires the method to be stored and then read, once for each thread + //if not output directory is specified, the model cannot be stored, and we must force single-threaded operation + if (OutputFolder != null) + { + mlContext.Model.Save(trainedModels[groupIndexNumber], dataView.Schema, Path.Combine(OutputFolder, "model.zip")); + } + + Compute_PEP_For_All_Peaks(donors, donorGroupIndices[groupIndexNumber], mlContext, trainedModels[groupIndexNumber], OutputFolder, MaxThreads); + + allMetrics.Add(metrics); + } + } + #endregion + + 
return AggregateMetricsForOutput(allMetrics); + } + + public static List OrderDonorGroups(List donors) + { + return donors.OrderByDescending(donor => donor.TargetAcceptors.Count) + .ThenByDescending(donor => donor.DecoyAcceptors.Count) + .ThenByDescending(donor => donor.BestTargetMbrScore) + .ToList(); + } + + //we add the indexes of the targets and decoys to the groups separately in the hope that we'll get at least one target and one decoy in each group. + //then training can possibly be more successful. + public static List[] GetDonorGroupIndices(List donors, int numGroups, double scoreCutoff) + { + List[] groupsOfIndices = new List[numGroups]; + for (int i = 0; i < numGroups; i++) + { + groupsOfIndices[i] = new List(); + } + + int myIndex = 0; + + while (myIndex < donors.Count) + { + int subIndex = 0; + while (subIndex < numGroups && myIndex < donors.Count) + { + groupsOfIndices[subIndex].Add(myIndex); + + subIndex++; + myIndex++; + } + } + + EqualizeDonorGroupIndices(donors, groupsOfIndices, scoreCutoff, numGroups); + + return groupsOfIndices; + } + + /// + /// Equalizes partitions used for cross-validation. The goal is to have the same number of targets and decoys in each partition + /// + /// List of all DonorGroups to be classified + /// An array of lists. 
Each list contains the indices of donor groups for a given partition + /// The MBR Score cutoff that determines which MBR target peaks will be used as positive training examples + /// /// Number of groups used for cross-validation, default = 3 + public static void EqualizeDonorGroupIndices(List donors, List[] groupsOfIndices, double scoreCutoff, int numGroups = 3) + { + HashSet swappedDonors = new HashSet(); // Keep track of everything we've swapped so we don't swap it again + // Outer loop iterates over the groups of indices (partitions) three times + // after each inner loop iterations, the number of ttargtes and decoys in each adjacent group is equal, but commonly group 1 and 3 will have a different number + // of targets and decoys. Looping three times should resolve this + for (int i = 0; i < numGroups*3 - 1; i++) + { + int groupA = i % numGroups; + int groupB = (i + 1) % numGroups; + int targetsA = 0; + int targetsB = 0; + int decoysA = 0; + int decoysB = 0; + foreach (int index in groupsOfIndices[groupA]) + { + targetsA += donors[index].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff); + decoysA += donors[index].DecoyAcceptors.Count; + } + foreach (int index in groupsOfIndices[groupB]) + { + targetsB += donors[index].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff); + decoysB += donors[index].DecoyAcceptors.Count; + } + + bool stuck = false; + int outerIterations = 0; + int minIndex = groupsOfIndices[groupA].Min(); + + // Calculate the difference in targets and decoys between the two groups + int targetSurplus = targetsA - targetsB; + int decoySurplus = decoysA - decoysB; + + while ((Math.Abs(targetSurplus) > 1 | Math.Abs(decoySurplus) > 1) && !stuck && outerIterations < 3) + { + bool swapped = false; + outerIterations++; + + int innerIterations = 0; + // start from the bottom of group 1, trying to swap peaks. 
+ // If group 1 has more targets than group 2, we want to swap groups to equalize the number of targets in each group + while (Math.Abs(targetSurplus) > 1 & !stuck & innerIterations < 3) + { + innerIterations++; + swapped = false; + // Traverse the list of donor indices in descending order, looking for a good candidate to swap + foreach (int donorIndexA in groupsOfIndices[groupA].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + int donorIndexATargetCount = donors[donorIndexA].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff); + switch (targetSurplus > 0) + { + case true: // i.e., too many targets + if (donorIndexATargetCount < 1) continue; // No targets to swap + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff) < donorIndexATargetCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + case false: // i.e., too few targets + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore > scoreCutoff) > donorIndexATargetCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + } + + // If we reach the index of the list of donorGroups, set stuck to true so that the outer loop will break + if (donorIndexA == minIndex) + { + stuck = true; + break; + } + if (swapped) + break; + + } + } + + innerIterations = 0; + // Now we'll do the decoys + while (Math.Abs(decoySurplus) > 1 & !stuck & innerIterations < 3) + { + innerIterations++; + swapped = 
false; + foreach (int donorIndexA in groupsOfIndices[groupA].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + int donorIndexADecoyCount = donors[donorIndexA].DecoyAcceptors.Count(); + switch (decoySurplus > 0) + { + case true: // i.e., too many decoys + if (donorIndexADecoyCount < 1) continue; // No decoys to swap + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].DecoyAcceptors.Count() < donorIndexADecoyCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + case false: // i.e., too few decoys + foreach (int donorIndexB in groupsOfIndices[groupB].Where(idx => !swappedDonors.Contains(idx)).OrderByDescending(idx => idx)) + { + if (donors[donorIndexB].DecoyAcceptors.Count() > donorIndexADecoyCount) + { + GroupSwap(donors, groupsOfIndices, donorIndexA, donorIndexB, groupA, groupB, + scoreCutoff, swappedDonors, ref targetSurplus, ref decoySurplus); + swapped = true; + break; + } + } + break; + } + + // If we reach the index of the list of donorGroups, set stuck to true so that the outer loop will break + if (donorIndexA == minIndex) + { + stuck = true; + break; + } + if (swapped) + break; + } + } + } + } + } + + /// + /// Takes in a list of donor groups and a list of indices for each group, and swaps two groups of indices + /// Updates the targetSurplus and decoySurplus variables + /// Updates the swappedDonors hash set to keep track of which donors have been swapped + /// This is done to equalize the number of targets and decoys in each paritition for cross validation + /// + public static void GroupSwap( + List donors, + List[] groupsOfIndices, + int donorIndexA, + int donorIndexB, + int groupsOfIndicesIndexA, + int groupsOfIndicesIndexB, + double scoreCutoff, + HashSet swappedDonors, + 
ref int targetSurplus, + ref int decoySurplus) + { + // Multiply by two because the surplus is the difference between the two groups + // So removing one peak from one group and adding it to the other group is a difference of two + targetSurplus += 2 * ( + donors[donorIndexB].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff) - + donors[donorIndexA].TargetAcceptors.Count(peak => peak.MbrScore >= scoreCutoff)); + decoySurplus += 2 * ( + donors[donorIndexB].DecoyAcceptors.Count - + donors[donorIndexA].DecoyAcceptors.Count); + + groupsOfIndices[groupsOfIndicesIndexA].Add(donorIndexB); + groupsOfIndices[groupsOfIndicesIndexA].Remove(donorIndexA); + groupsOfIndices[groupsOfIndicesIndexB].Add(donorIndexA); + groupsOfIndices[groupsOfIndicesIndexB].Remove(donorIndexB); + } + + /// + /// Creates chromatographic peak data that will be used to train the machine learning model + /// Classifies peaks as positive or negative training examples + /// Positive training examples are peaks with MBR scores above the 25th percentile, + /// Negative training examples are peaks with random retention times + /// + /// The list of donor groups. + /// The list of donor indices. + /// The maximum number of threads. + /// The enumerable of chromatographic peak data. 
+ public IEnumerable CreateChromatographicPeakData(List donors, List donorIndices, int maxThreads) + { + object ChromatographicPeakDataListLock = new object(); + List ChromatographicPeakDataList = new List(); + int[] threads = Enumerable.Range(0, maxThreads).ToArray(); + + List pipScores = new(); + foreach(int i in donorIndices) + { + pipScores.AddRange(donors[i].Select(peak => peak.MbrScore)); + } + pipScores.Sort((a, b) => b.CompareTo(a)); // This is a descending sort + double groupSpecificPipScoreCutoff = pipScores[(int)Math.Floor(pipScores.Count * 0.25)]; + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localChromatographicPeakDataList = new List(); + for (int i = range.Item1; i < range.Item2; i++) + { + var donor = donors[donorIndices[i]]; + foreach (var peak in donor) + { + ChromatographicPeakData newChromatographicPeakData = new ChromatographicPeakData(); + if (peak.RandomRt) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: false); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + else if (!peak.RandomRt & peak.MbrScore >= groupSpecificPipScoreCutoff) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: true); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + } + } + lock (ChromatographicPeakDataListLock) + { + ChromatographicPeakDataList.AddRange(localChromatographicPeakDataList); + } + }); + + ChromatographicPeakData[] pda = ChromatographicPeakDataList.ToArray(); + + return pda.AsEnumerable(); + } + + /// + /// Creates chromatographic peak data, but uses PEP values instead of MBR scores to select the positive training examples + /// + /// The list of donor groups. + /// The list of donor indices. + /// The maximum number of threads. + /// The enumerable of chromatographic peak data. 
+ public IEnumerable CreateChromatographicPeakDataIteration(List donors, List donorIndices, int maxThreads) + { + object ChromatographicPeakDataListLock = new object(); + List ChromatographicPeakDataList = new List(); + int[] threads = Enumerable.Range(0, maxThreads).ToArray(); + + List peps = new(); + foreach (int i in donorIndices) + { + peps.AddRange(donors[i].Select(peak => peak.MbrPep ?? 1)); + } + peps.Sort(); + double groupSpecificPepCutoff = peps[(int)Math.Floor(peps.Count * 0.25)]; + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + List localChromatographicPeakDataList = new List(); + for (int i = range.Item1; i < range.Item2; i++) + { + var donor = donors[donorIndices[i]]; + foreach (var peak in donor) + { + ChromatographicPeakData newChromatographicPeakData = new ChromatographicPeakData(); + if (peak.RandomRt) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: false); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + else if (!peak.RandomRt & peak.MbrPep <= groupSpecificPepCutoff) + { + newChromatographicPeakData = CreateOneChromatographicPeakDataEntry(peak, label: true); + localChromatographicPeakDataList.Add(newChromatographicPeakData); + } + } + } + lock (ChromatographicPeakDataListLock) + { + ChromatographicPeakDataList.AddRange(localChromatographicPeakDataList); + } + }); + + ChromatographicPeakData[] pda = ChromatographicPeakDataList.ToArray(); + + return pda.AsEnumerable(); + } + + public static void Compute_PEP_For_All_Peaks( + List donors, + List donorIndices, + MLContext mLContext, + TransformerChain>> trainedModel, + string outputFolder, int maxThreads) + { + object lockObject = new object(); + + //the trained model is not threadsafe. Therefore, to use the same model for each thread saved the model to disk. Then each thread reads its own copy of the model back from disk. 
+ //If there is no output folder specified, then this can't happen. We set maxthreads eqaul to one and use the model that gets passed into the method. + if (String.IsNullOrEmpty(outputFolder)) + { + maxThreads = 1; + } + + Parallel.ForEach(Partitioner.Create(0, donorIndices.Count), + new ParallelOptions { MaxDegreeOfParallelism = maxThreads }, + (range, loopState) => + { + + ITransformer threadSpecificTrainedModel; + if (maxThreads == 1) + { + threadSpecificTrainedModel = trainedModel; + } + else + { + threadSpecificTrainedModel = mLContext.Model.Load(Path.Combine(outputFolder, "model.zip"), out DataViewSchema savedModelSchema); + } + + // one prediction engine per thread, because the prediction engine is not thread-safe + var threadPredictionEngine = mLContext.Model.CreatePredictionEngine(threadSpecificTrainedModel); + + for (int i = range.Item1; i < range.Item2; i++) + { + DonorGroup donor = donors[donorIndices[i]]; + + foreach(ChromatographicPeak peak in donor) + { + ChromatographicPeakData pd = CreateOneChromatographicPeakDataEntry(peak, label: !peak.RandomRt); + var pepValuePrediction = threadPredictionEngine.Predict(pd); + peak.MbrPep = 1 - pepValuePrediction.Probability; + } + } + }); + } + + public static string AggregateMetricsForOutput(List allMetrics) + { + List accuracy = allMetrics.Select(m => m.Accuracy).ToList(); + List areaUnderRocCurve = allMetrics.Select(m => m.AreaUnderRocCurve).ToList(); + List areaUnderPrecisionRecallCurve = allMetrics.Select(m => m.AreaUnderPrecisionRecallCurve).ToList(); + List F1Score = allMetrics.Select(m => m.F1Score).ToList(); + List logLoss = allMetrics.Select(m => m.LogLoss).ToList(); + List logLossReduction = allMetrics.Select(m => m.LogLossReduction).ToList(); + List positivePrecision = allMetrics.Select(m => m.PositivePrecision).ToList(); + List positiveRecall = allMetrics.Select(m => m.PositiveRecall).ToList(); + List negativePrecision = allMetrics.Select(m => m.NegativePrecision).ToList(); + List negativeRecall = 
allMetrics.Select(m => m.NegativeRecall).ToList(); + + // log-loss can stochastically take on a value of infinity. + // correspondingly, log-loss reduction can be negative infinity. + // when this happens for one or more of the metrics, it can lead to uninformative numbers. + // so, unless they are all infinite, we remove them from the average. If they are all infinite, we report that. + + logLoss.RemoveAll(x => x == Double.PositiveInfinity); + logLossReduction.RemoveAll(x => x == Double.NegativeInfinity); + + double logLossAverage = Double.PositiveInfinity; + double logLossReductionAverage = Double.NegativeInfinity; + + if ((logLoss != null) && (logLoss.Any())) + { + logLossAverage = logLoss.Average(); + } + + if ((logLossReduction != null) && (logLossReduction.Any())) + { + logLossReductionAverage = logLossReduction.Average(); + } + + StringBuilder s = new StringBuilder(); + s.AppendLine(); + s.AppendLine("************************************************************"); + s.AppendLine("* Metrics for Determination of PEP Using Binary Classification "); + s.AppendLine("*-----------------------------------------------------------"); + s.AppendLine("* Accuracy: " + accuracy.Average().ToString()); + s.AppendLine("* Area Under Curve: " + areaUnderRocCurve.Average().ToString()); + s.AppendLine("* Area under Precision recall Curve: " + areaUnderPrecisionRecallCurve.Average().ToString()); + s.AppendLine("* F1Score: " + F1Score.Average().ToString()); + s.AppendLine("* LogLoss: " + logLossAverage.ToString()); + s.AppendLine("* LogLossReduction: " + logLossReductionAverage.ToString()); + s.AppendLine("* PositivePrecision: " + positivePrecision.Average().ToString()); + s.AppendLine("* PositiveRecall: " + positiveRecall.Average().ToString()); + s.AppendLine("* NegativePrecision: " + negativePrecision.Average().ToString()); + s.AppendLine("* NegativeRecall: " + negativeRecall.Average().ToString()); + s.AppendLine("************************************************************"); + 
return s.ToString(); + } + + public static ChromatographicPeakData CreateOneChromatographicPeakDataEntry(ChromatographicPeak peak,bool label) + { + + peak.PepPeakData = new ChromatographicPeakData + { + PpmErrorScore = (float)peak.PpmScore, + IntensityScore = (float)peak.IntensityScore, + RtScore = (float)peak.RtScore, + ScanCountScore = (float)peak.ScanCountScore, + IsotopicDistributionScore = (float)peak.IsotopicDistributionScore, + + PpmErrorRaw = (float)Math.Abs(peak.MassError), + IntensityRaw = (float)Math.Log2(peak.Intensity), + RtPredictionErrorRaw = (float)Math.Abs(peak.RtPredictionError), + ScanCountRaw = (float)peak.IsotopicEnvelopes.Count, + IsotopicPearsonCorrelation = (float)(peak.IsotopicPearsonCorrelation), + + Label = label, + + }; + + return peak.PepPeakData; + } + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs b/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs new file mode 100644 index 000000000..5837959d8 --- /dev/null +++ b/mzLib/FlashLFQ/PEP/TruePositivePrediction.cs @@ -0,0 +1,18 @@ +using Microsoft.ML.Data; + +namespace FlashLFQ.PEP +{ + public class TruePositivePrediction + { + // ColumnName attribute is used to change the column name from + // its default value, which is the name of the field. + [ColumnName("PredictedLabel")] + public bool Prediction; + + // No need to specify ColumnName attribute, because the field + // name "Probability" is the column name we want. 
+ public float Probability; + + public float Score; + } +} \ No newline at end of file diff --git a/mzLib/FlashLFQ/Peptide.cs b/mzLib/FlashLFQ/Peptide.cs index 39088d35a..c8bb6b64b 100644 --- a/mzLib/FlashLFQ/Peptide.cs +++ b/mzLib/FlashLFQ/Peptide.cs @@ -1,5 +1,7 @@ -using System.Collections.Generic; +using Easy.Common.Extensions; +using System.Collections.Generic; using System.Linq; +using System.Runtime.CompilerServices; using System.Text; namespace FlashLFQ @@ -67,6 +69,18 @@ public void SetIntensity(SpectraFileInfo fileInfo, double intensity) } } + public double GetTotalIntensity() + { + if (Intensities.IsNotNullOrEmpty()) + { + return Intensities.Sum(i => i.Value); + } + else + { + return 0; + } + } + public DetectionType GetDetectionType(SpectraFileInfo fileInfo) { if (DetectionTypes.TryGetValue(fileInfo, out DetectionType detectionType)) diff --git a/mzLib/FlashLFQ/RtInfo.cs b/mzLib/FlashLFQ/RtInfo.cs index 0750e4588..ceda038e4 100644 --- a/mzLib/FlashLFQ/RtInfo.cs +++ b/mzLib/FlashLFQ/RtInfo.cs @@ -9,15 +9,10 @@ namespace FlashLFQ public class RtInfo { public double PredictedRt { get; } - public double Width { get; } + public double Width { get; set; } public double RtStartHypothesis => PredictedRt - (Width / 2.0); public double RtEndHypothesis => PredictedRt + (Width / 2.0); - // These will be introduced in a later PR. 
For now, we're sticking with the classic version - //private double _minimumWindowWidth = 0.5; - //public double RtStartHypothesis => PredictedRt - Math.Max((Width / 2.0), _minimumWindowWidth/2); // the Math.Max components ensure that the width of an RT Window is at least _minimumWindowWidth wide - //public double RtEndHypothesis => PredictedRt + Math.Max((Width / 2.0), _minimumWindowWidth/2); - public RtInfo(double predictedRt, double width) { PredictedRt = predictedRt; diff --git a/mzLib/FlashLFQ/SpectraFileInfo.cs b/mzLib/FlashLFQ/SpectraFileInfo.cs index 5dd5b4ab2..9cc24b6d7 100644 --- a/mzLib/FlashLFQ/SpectraFileInfo.cs +++ b/mzLib/FlashLFQ/SpectraFileInfo.cs @@ -1,4 +1,6 @@ -namespace FlashLFQ +using System.IO; + +namespace FlashLFQ { public class SpectraFileInfo { @@ -39,5 +41,9 @@ public override int GetHashCode() { return FullFilePathWithExtension.GetHashCode(); } + public override string ToString() + { + return Path.GetFileName(FullFilePathWithExtension); + } } } \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs index 4a919a7e5..8f7bb320b 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs @@ -1,20 +1,17 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Text; -using System.Threading.Tasks; using Chemistry; -using Easy.Common.Extensions; using MathNet.Numerics.Statistics; using MzLibUtil; namespace MassSpectrometry { - public class ClassicDeconvolutionAlgorithm : DeconvolutionAlgorithm + internal class ClassicDeconvolutionAlgorithm : DeconvolutionAlgorithm { private MzSpectrum spectrum; - public ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) : base(deconParameters) + internal 
ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) : base(deconParameters) { } @@ -25,7 +22,7 @@ public ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) : /// spectrum to deconvolute /// Range of peaks to deconvolute /// - public override IEnumerable Deconvolute(MzSpectrum spectrumToDeconvolute, MzRange range) + internal override IEnumerable Deconvolute(MzSpectrum spectrumToDeconvolute, MzRange range) { var deconParams = DeconvolutionParameters as ClassicDeconvolutionParameters ?? throw new MzLibException("Deconvolution params and algorithm do not match"); spectrum = spectrumToDeconvolute; @@ -205,7 +202,7 @@ private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateFor } } - return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, Statistics.StandardDeviation(listOfRatios), massIndex); + return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, listOfRatios.StandardDeviation()); } private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, double mostIntensePeakMz, int massIndex, double deconvolutionTolerancePpm, double intensityRatioLimit, double minChargeToLookFor, double maxChargeToLookFor, List monoisotopicMassPredictions) diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs index e8a052e39..efb8cd248 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/DeconvolutionAlgorithm.cs @@ -1,13 +1,14 @@ using System; using System.Collections.Generic; using System.Linq; -using System.Text; -using System.Threading.Tasks; using Chemistry; using MzLibUtil; namespace MassSpectrometry { + /// + /// Parent class defining minimum requirement to be used + /// public abstract class DeconvolutionAlgorithm { // For ClassicDeconv. 
If not used elsewhere, move to that class @@ -79,6 +80,6 @@ protected DeconvolutionAlgorithm(DeconvolutionParameters deconParameters) /// spectrum to be deconvoluted /// Range of peaks to deconvolute /// - public abstract IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range); + internal abstract IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range); } } diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs index 18957d8d0..c70c10b63 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ExampleNewDeconvolutionAlgorithmTemplate.cs @@ -1,22 +1,19 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; -using System.Linq; -using System.Text; -using System.Threading.Tasks; using MzLibUtil; namespace MassSpectrometry { [ExcludeFromCodeCoverage] - public class ExampleNewDeconvolutionAlgorithmTemplate : DeconvolutionAlgorithm + internal class ExampleNewDeconvolutionAlgorithmTemplate : DeconvolutionAlgorithm { - public ExampleNewDeconvolutionAlgorithmTemplate(DeconvolutionParameters deconParameters) : base(deconParameters) + internal ExampleNewDeconvolutionAlgorithmTemplate(DeconvolutionParameters deconParameters) : base(deconParameters) { } - public override IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range = null) + internal override IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range = null) { var deconParams = DeconvolutionParameters as ExampleNewDeconvolutionParametersTemplate ?? 
throw new MzLibException("Deconvolution params and algorithm do not match"); range ??= spectrum.Range; diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/IsoDecAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/IsoDecAlgorithm.cs new file mode 100644 index 000000000..6c0dab348 --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/IsoDecAlgorithm.cs @@ -0,0 +1,159 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Runtime.InteropServices; +using MzLibUtil; + +namespace MassSpectrometry +{ + /// + /// Performs deconvolution on a single spectrum or region of spectrum using the Isodec algorithm + /// + /// Isodec only needs to region of interest and does not use surrounding charge states as references. + /// Isodec can report multiple monoisotopic masses for a single peak if enabled by ReportMultipleMonoisos parameter + /// In this case, the resulting isotopic envelopes will have the same precursor ID. + /// + /// + internal class IsoDecAlgorithm : DeconvolutionAlgorithm + { + internal IsoDecAlgorithm(DeconvolutionParameters deconParameters) : base(deconParameters) + { + + } + + /// + /// Struct passed by pointer in memory to the Isodec.dll + /// + [StructLayout(LayoutKind.Sequential, Pack =1)] + public struct MatchedPeak + { + public float mz; + public int z; + public float monoiso; + public float peakmass; + public float avgmass; + public float area; + public float peakint; + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 64)] + public float[] matchedindsiso; + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 64)] + public float[] matchedindsexp; + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 64)] + public float[] isomz; + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 64)] + public float[] isodist; + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 64)] + public float[] isomass; + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 16)] + public float[] monoisos; + int startindex; + int 
endindex; + public float score; + public int realisolength; + } + + /// + /// Calls the Isodec.dll to perform deconvolution on the given spectrum + /// The Isodec.dll requires three other dll's as dependencies: isogenmass.dll, libmmd.dll, scml_dispmd.dll + /// + /// + /// + /// + /// + /// + /// + /// + + [DllImport("isodeclib.dll", EntryPoint = "process_spectrum", CallingConvention = CallingConvention.Cdecl)] + protected static extern int process_spectrum(double[] cmz, float[] cintensity, int c, string fname, IntPtr matchedpeaks, IsoDecDeconvolutionParameters.IsoSettings settings); + + internal override IEnumerable Deconvolute(MzSpectrum spectrum, MzRange range) + { + var deconParams = DeconvolutionParameters as IsoDecDeconvolutionParameters ?? throw new MzLibException("Deconvolution params and algorithm do not match"); + + var firstIndex = spectrum.GetClosestPeakIndex(range.Minimum); + var lastIndex = spectrum.GetClosestPeakIndex(range.Maximum); + + var mzs = spectrum.XArray[firstIndex..lastIndex] + .Select(p => p) + .ToArray(); + var intensities = spectrum.YArray[firstIndex..lastIndex] + .Select(p => (float)p) + .ToArray(); + + var mpArray = new byte[intensities.Length * Marshal.SizeOf(typeof(MatchedPeak))]; + GCHandle handle = GCHandle.Alloc(mpArray, GCHandleType.Pinned); + try + { + IntPtr matchedPeaksPtr = (IntPtr)handle.AddrOfPinnedObject(); + IsoDecDeconvolutionParameters.IsoSettings settings = deconParams.ToIsoSettings(); + int result = process_spectrum(mzs, intensities, intensities.Length, null, matchedPeaksPtr, settings); + if (result <= 0) + return Enumerable.Empty(); + + // Handle results + MatchedPeak[] matchedpeaks = new MatchedPeak[result]; + for (int i = 0; i < result; i++) + { + matchedpeaks[i] = Marshal.PtrToStructure(matchedPeaksPtr + i * Marshal.SizeOf(typeof(MatchedPeak))); + } + + return ConvertToIsotopicEnvelopes(deconParams, matchedpeaks, spectrum); + } + finally + { + handle.Free(); + } + } + + /// + /// Converts the isodec output 
(MatchedPeak) to IsotopicEnvelope for return + /// + /// + /// + /// + /// + private List ConvertToIsotopicEnvelopes(IsoDecDeconvolutionParameters parameters, MatchedPeak[] matchedpeaks, MzSpectrum spectrum) + { + List result = new List(); + int currentId = 0; + var tolerance = new PpmTolerance(5); + foreach(MatchedPeak peak in matchedpeaks) + { + List<(double,double)> peaks = new List<(double,double)> (); + for (int i = 0; i < peak.realisolength; i++) + { + + List indicesWithinTolerance = spectrum.GetPeakIndicesWithinTolerance(peak.isomz[i], tolerance); + double maxIntensity = 0; + int maxIndex = -1; + foreach (int index in indicesWithinTolerance) + { + if (spectrum.YArray[index] > maxIntensity) { maxIntensity = spectrum.YArray[index]; maxIndex = index; } + } + if (maxIndex >= 0) + { + peaks.Add((spectrum.XArray[maxIndex], spectrum.YArray[maxIndex])); + } + else + { + peaks.Add((peak.isomz[i], 0)); + } + + } + int charge = peak.z; + if(parameters.Polarity == Polarity.Negative) { charge = -peak.z; } + if(parameters.ReportMulitpleMonoisos) + { + foreach (float monoiso in peak.monoisos) + { + if (monoiso > 0) { result.Add(new IsotopicEnvelope(currentId, peaks, (double)monoiso, charge, peak.peakint, peak.score)); } + } + } + else { result.Add(new IsotopicEnvelope(currentId, peaks, (double)peak.monoiso, charge, peak.peakint, peak.score)); } + currentId++; + } + return result; + } + } +} diff --git a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs index d419561f2..af32b78f9 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs @@ -1,20 +1,9 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; -using Easy.Common.Extensions; -using Easy.Common.Interfaces; +using System.Collections.Generic; +using Chemistry; using MzLibUtil; namespace MassSpectrometry { - public enum 
DeconvolutionType - { - ClassicDeconvolution, - ExampleNewDeconvolutionTemplate, - } - /// /// Context class for all deconvolution /// @@ -30,27 +19,9 @@ public static class Deconvoluter public static IEnumerable Deconvolute(MsDataScan scan, DeconvolutionParameters deconvolutionParameters, MzRange rangeToGetPeaksFrom = null) { - rangeToGetPeaksFrom ??= scan.MassSpectrum.Range; - - // set deconvolution algorithm and any specific deconvolution parameters found in the MsDataScan - DeconvolutionAlgorithm deconAlgorithm; - switch (deconvolutionParameters.DeconvolutionType) - { - case DeconvolutionType.ClassicDeconvolution: - deconAlgorithm = new ClassicDeconvolutionAlgorithm(deconvolutionParameters); - break; - - case DeconvolutionType.ExampleNewDeconvolutionTemplate: - deconAlgorithm = new ExampleNewDeconvolutionAlgorithmTemplate(deconvolutionParameters); - break; - - default: throw new MzLibException("DeconvolutionType not yet supported"); - } - - return deconAlgorithm.Deconvolute(scan.MassSpectrum, rangeToGetPeaksFrom); + // set any specific deconvolution parameters found only in the MsDataScan + return Deconvolute(scan.MassSpectrum, deconvolutionParameters, rangeToGetPeaksFrom); } - - /// /// Static deconvolution of an MzSpectrum that does not require Deconvoluter construction @@ -64,22 +35,53 @@ public static IEnumerable Deconvolute(MzSpectrum spectrum, { rangeToGetPeaksFrom ??= spectrum.Range; + // Short circuit deconvolution if it is called on a neutral mass spectrum + if (spectrum is NeutralMassSpectrum newt) + return DeconvoluteNeutralMassSpectrum(newt, rangeToGetPeaksFrom); + // set deconvolution algorithm - DeconvolutionAlgorithm deconAlgorithm; - switch (deconvolutionParameters.DeconvolutionType) + DeconvolutionAlgorithm deconAlgorithm = CreateAlgorithm(deconvolutionParameters); + + // Delegate deconvolution to the algorithm + return deconAlgorithm.Deconvolute(spectrum, rangeToGetPeaksFrom); + } + + /// + /// Factory method to create the correct 
deconvolution algorithm from the parameters + /// + /// + /// + /// + private static DeconvolutionAlgorithm CreateAlgorithm(DeconvolutionParameters parameters) + { + return parameters.DeconvolutionType switch { - case DeconvolutionType.ClassicDeconvolution: - deconAlgorithm = new ClassicDeconvolutionAlgorithm(deconvolutionParameters); - break; + DeconvolutionType.ClassicDeconvolution => new ClassicDeconvolutionAlgorithm(parameters), + DeconvolutionType.ExampleNewDeconvolutionTemplate => new ExampleNewDeconvolutionAlgorithmTemplate(parameters), + DeconvolutionType.IsoDecDeconvolution => new IsoDecAlgorithm(parameters), + _ => throw new MzLibException("DeconvolutionType not yet supported") + }; + } - case DeconvolutionType.ExampleNewDeconvolutionTemplate: - deconAlgorithm = new ExampleNewDeconvolutionAlgorithmTemplate(deconvolutionParameters); - break; + /// + /// Returns all peaks in the neutral mass spectrum as an isotopic envelope with a single peak + /// + /// + /// + /// + private static IEnumerable DeconvoluteNeutralMassSpectrum(NeutralMassSpectrum neutralSpectrum, MzRange range) + { + for (int i = 0; i < neutralSpectrum.XArray.Length; i++) + { + double neutralMass = neutralSpectrum.XArray[i]; + double intensity = neutralSpectrum.YArray[i]; + int chargeState = neutralSpectrum.Charges[i]; - default: throw new MzLibException("DeconvolutionType not yet supported"); + if (range.Contains(neutralMass.ToMz(chargeState))) + { + yield return new IsotopicEnvelope(neutralMass, intensity, chargeState); + } } - - return deconAlgorithm.Deconvolute(spectrum, rangeToGetPeaksFrom); } } } diff --git a/mzLib/MassSpectrometry/Deconvolution/DeconvolutionType.cs b/mzLib/MassSpectrometry/Deconvolution/DeconvolutionType.cs new file mode 100644 index 000000000..f2fece52e --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/DeconvolutionType.cs @@ -0,0 +1,15 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + 
+namespace MassSpectrometry +{ + public enum DeconvolutionType + { + ClassicDeconvolution, + ExampleNewDeconvolutionTemplate, + IsoDecDeconvolution, + } +} diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/IsoDecDeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/IsoDecDeconvolutionParameters.cs new file mode 100644 index 000000000..649ee9a11 --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/IsoDecDeconvolutionParameters.cs @@ -0,0 +1,214 @@ +using System.Runtime.InteropServices; + +namespace MassSpectrometry; +public class IsoDecDeconvolutionParameters : DeconvolutionParameters +{ + public override DeconvolutionType DeconvolutionType { get; protected set; } = DeconvolutionType.IsoDecDeconvolution; + + #region Interop Parameters + + /// + /// The struct that is passed into the isodec.dll + /// + public struct IsoSettings + { + public int phaseres; // Precision of encoding matrix + public int verbose; // Verbose output + public int peakwindow; // Peak Detection Window + public float peakthresh; // Peak Detection Threshold + public int minpeaks; // Minimum Peaks for an allowed peak + public float css_thresh; // Minimum cosine similarity score for isotope distribution + public float matchtol; // Match Tolerance for peak detection in ppm + public int maxshift; // Maximum shift allowed for isotope distribution + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 2)] + public float[] mzwindow; // MZ Window for isotope distribution + [MarshalAs(UnmanagedType.ByValArray, SizeConst = 2)] + public float[] plusoneintwindow; // Plus One Intensity range. Will be used for charge state 1 + public int knockdown_rounds; // Number of knockdown rounds + public float min_score_diff; // Minimum score difference for isotope distribution to allow missed monoisotopic peaks + public float minareacovered; // Minimum area covered by isotope distribution. 
Use in or with css_thresh + public int isolength; // Isotope Distribution Length + public double mass_diff_c; // Mass difference between isotopes + public float adductmass; // Adduct Mass + public int minusoneaszero; // Use set the -1 isotope as 0 to help force better alignments + public float isotopethreshold; // Threshold for isotope distribution. Will remove relative intensities below this. + public float datathreshold; // Threshold for data. Will remove relative intensities below this relative to max intensity in each cluster + public float zscore_threshold; //Ratio above which a secondary charge state prediction will be returned. + } + + private IsoSettings? _isoSettings; + + internal IsoSettings ToIsoSettings() + { + if (_isoSettings != null) + return _isoSettings.Value; + + IsoSettings result = new IsoSettings + { + phaseres = PhaseRes, + verbose = Verbose, + peakwindow = PeakWindow, + peakthresh = PeakThreshold, + minpeaks = MinPeaks, + css_thresh = CssThreshold, + matchtol = MatchTolerance, + maxshift = MaxShift, + mzwindow = MzWindow, + plusoneintwindow = PlusOneIntWindow, + knockdown_rounds = KnockdownRounds, + min_score_diff = MinScoreDiff, + minareacovered = MinAreaCovered, + isolength = IsoLength, + mass_diff_c = MassDiffC, + adductmass = AdductMass, + minusoneaszero = MinusOneAreasZero, + isotopethreshold = IsotopeThreshold, + datathreshold = DataThreshold, + zscore_threshold = ZScoreThreshold + }; + + _isoSettings = result; + return result; + } + + #endregion + + #region User-Accessible Parameters + + /// + /// Precision of encoding matrix + /// + public int PhaseRes { get; set; } + + /// + /// Minimum cosine similarity score for isotope distribution + /// + public float CssThreshold { get; set; } + + /// + /// Match Tolerance for peak detection in ppm + /// + public float MatchTolerance { get; set; } + + /// + /// Maximum shift allowed for isotope distribution + /// + public int MaxShift { get; set; } + + /// + /// MZ Window for isotope 
distribution + /// + public float[] MzWindow { get; set; } + + /// + /// Number of knockdown rounds + /// + public int KnockdownRounds { get; set; } + + /// + /// Minimum area covered by isotope distribution. Use in or with css_thresh + /// + public float MinAreaCovered { get; set; } + + /// + /// Threshold for data. Will remove relative intensities below this relative to max intensity in each cluster + /// + public float DataThreshold { get; set; } + + /// + /// Report multiple monoisotopic peaks + /// + public bool ReportMulitpleMonoisos { get; set; } + + #endregion User-Accessible Parameters + + #region Hard-Coded Parameters + + /// + /// Verbose output + /// + public int Verbose { get; protected set; } = 0; + + /// + /// Peak Detection Window + /// + public int PeakWindow { get; protected set; } = 80; + + /// + /// Peak Detection Threshold + /// + public float PeakThreshold { get; protected set; } = (float)0.0001; + + /// + /// Minimum Peaks for an allowed peak + /// + public int MinPeaks { get; protected set; } = 3; + + /// + /// Plus One Intensity range. Will be used for charge state 1 + /// + public float[] PlusOneIntWindow { get; protected set; } = new float[] { (float)0.1, (float)0.6 }; + + /// + /// Minimum score difference for isotope distribution to allow missed monoisotopic peaks + /// + public float MinScoreDiff { get; protected set; } = (float)0.1; + + /// + /// Isotope Distribution Length + /// + public int IsoLength { get; protected set; } = 64; + + /// + /// Mass difference between isotopes + /// + public double MassDiffC { get; protected set; } = 1.0033; + + /// + /// Adduct Mass + /// + public float AdductMass { get; protected set; } = (float)1.00727276467; + + /// + /// Use set the -1 isotope as 0 to help force better alignments + /// + public int MinusOneAreasZero { get; protected set; } = 1; + + /// + /// Threshold for isotope distribution. Will remove relative intensities below this. 
+ /// + public float IsotopeThreshold { get; protected set; } = (float)0.01; + + /// + /// Ratio above which a secondary charge state prediction will be returned. + /// + public float ZScoreThreshold { get; protected set; } = (float)0.95; + + #endregion Hard-Coded Parameters + + public IsoDecDeconvolutionParameters( + Polarity polarity = Polarity.Positive, + int phaseRes = 8, + bool reportMultipleMonoisos = true, + float cssThreshold = (float)0.7, + float matchTolerance = (float)5, + int maxShift = 3, + float[] mzWindow = null, + int knockdownRounds = 5, + float minAreaCovered = (float)0.20, + float relativeDataThreshold = (float)0.05) + : base(1, 50, polarity) + { + PhaseRes = phaseRes; + ReportMulitpleMonoisos = reportMultipleMonoisos; + CssThreshold = cssThreshold; + MatchTolerance = matchTolerance; + MaxShift = maxShift; + MzWindow = mzWindow ?? new float[] { (float)-1.05, (float)2.05 }; + KnockdownRounds = knockdownRounds; + MinAreaCovered = minAreaCovered; + DataThreshold = relativeDataThreshold; + if (Polarity == Polarity.Negative) + AdductMass = (float)-1.00727276467; + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Enums/DissociationType.cs b/mzLib/MassSpectrometry/Enums/DissociationType.cs index 1ac136197..ca738b3fa 100644 --- a/mzLib/MassSpectrometry/Enums/DissociationType.cs +++ b/mzLib/MassSpectrometry/Enums/DissociationType.cs @@ -109,6 +109,11 @@ public enum DissociationType /// LowCID, + /// + /// activated ion electron photo detachment dissociation + /// + aEPD, + Unknown, AnyActivationType, Custom, diff --git a/mzLib/MassSpectrometry/MassSpectrometry.csproj b/mzLib/MassSpectrometry/MassSpectrometry.csproj index 6af63b9e4..e662d170d 100644 --- a/mzLib/MassSpectrometry/MassSpectrometry.csproj +++ b/mzLib/MassSpectrometry/MassSpectrometry.csproj @@ -20,4 +20,23 @@ + + + + + + + + Always + + + Always + + + Always + + + Always + + diff --git a/mzLib/MassSpectrometry/MsDataFile.cs b/mzLib/MassSpectrometry/MsDataFile.cs index 
7242f2cf7..3e88fcfef 100644 --- a/mzLib/MassSpectrometry/MsDataFile.cs +++ b/mzLib/MassSpectrometry/MsDataFile.cs @@ -23,9 +23,6 @@ namespace MassSpectrometry { - // TODO: Define scope of class - // Class scope is to provide to the data loaded from the DataFile. - /// /// A class for interacting with data collected from a Mass Spectrometer, and stored in a file /// diff --git a/mzLib/MassSpectrometry/MsDataScan.cs b/mzLib/MassSpectrometry/MsDataScan.cs index 6e54e05e8..71cc16502 100644 --- a/mzLib/MassSpectrometry/MsDataScan.cs +++ b/mzLib/MassSpectrometry/MsDataScan.cs @@ -27,9 +27,29 @@ namespace MassSpectrometry { public class MsDataScan { - public MsDataScan(MzSpectrum massSpectrum, int oneBasedScanNumber, int msnOrder, bool isCentroid, Polarity polarity, double retentionTime, MzRange scanWindowRange, string scanFilter, MZAnalyzerType mzAnalyzer, - double totalIonCurrent, double? injectionTime, double[,] noiseData, string nativeId, double? selectedIonMz = null, int? selectedIonChargeStateGuess = null, double? selectedIonIntensity = null, double? isolationMZ = null, - double? isolationWidth = null, DissociationType? dissociationType = null, int? oneBasedPrecursorScanNumber = null, double? selectedIonMonoisotopicGuessMz = null, string hcdEnergy = null, string scanDescription = null) + public MsDataScan(MzSpectrum massSpectrum, + int oneBasedScanNumber, + int msnOrder, + bool isCentroid, + Polarity polarity, + double retentionTime, + MzRange scanWindowRange, + string scanFilter, + MZAnalyzerType mzAnalyzer, + double totalIonCurrent, + double? injectionTime, + double[,] noiseData, + string nativeId, + double? selectedIonMz = null, + int? selectedIonChargeStateGuess = null, + double? selectedIonIntensity = null, + double? isolationMZ = null, + double? isolationWidth = null, + DissociationType? dissociationType = null, + int? oneBasedPrecursorScanNumber = null, + double? 
selectedIonMonoisotopicGuessMz = null, + string hcdEnergy = null, + string scanDescription = null) { OneBasedScanNumber = oneBasedScanNumber; MsnOrder = msnOrder; @@ -61,7 +81,7 @@ public MsDataScan(MzSpectrum massSpectrum, int oneBasedScanNumber, int msnOrder, /// public MzSpectrum MassSpectrum { get; protected set; } - public int OneBasedScanNumber { get; private set; } + public int OneBasedScanNumber { get; protected set; } public int MsnOrder { get; } public double RetentionTime { get; } public Polarity Polarity { get; } @@ -70,7 +90,7 @@ public MsDataScan(MzSpectrum massSpectrum, int oneBasedScanNumber, int msnOrder, public string ScanFilter { get; } public string NativeId { get; private set; } public bool IsCentroid { get; } - public double TotalIonCurrent { get; } + public double TotalIonCurrent { get; protected set; } public double? InjectionTime { get; } public double[,] NoiseData { get; } @@ -82,7 +102,7 @@ public MsDataScan(MzSpectrum massSpectrum, int oneBasedScanNumber, int msnOrder, public double? SelectedIonMZ { get; private set; } // May be adjusted by calibration public DissociationType? DissociationType { get; } public double? IsolationWidth { get; } - public int? OneBasedPrecursorScanNumber { get; private set; } + public int? OneBasedPrecursorScanNumber { get; protected set; } public double? SelectedIonMonoisotopicGuessIntensity { get; private set; } // May be refined public double? 
SelectedIonMonoisotopicGuessMz { get; private set; } // May be refined public string HcdEnergy { get; private set; } diff --git a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs index 7e42426b1..9fc5c05b1 100644 --- a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs +++ b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs @@ -14,29 +14,57 @@ public class IsotopicEnvelope : IHasMass /// /// Mass of most abundant observed isotopic peak, not accounting for addition or subtraction or protons due to ESI charge state induction /// - public double MostAbundantObservedIsotopicMass { get; private set; } + internal double MostAbundantObservedIsotopicMass { get; private set; } public readonly int Charge; public readonly double TotalIntensity; - public readonly double StDev; - public readonly int MassIndex; + public readonly int PrecursorId; public double Score { get; private set; } - public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass, int bestChargeState, double bestTotalIntensity, double bestStDev, int bestMassIndex) + /// + /// Used for an isotopic envelope that mzLib deconvoluted (e.g., from a mass spectrum) + /// + public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass, int bestChargeState, double bestTotalIntensity, double bestStDev) { Peaks = bestListOfPeaks; MonoisotopicMass = bestMonoisotopicMass; - MostAbundantObservedIsotopicMass = GetMostAbundantObservedIsotopicMass(bestListOfPeaks, bestChargeState); + MostAbundantObservedIsotopicMass = bestListOfPeaks.MaxBy(p => p.intensity).mz * Math.Abs(bestChargeState); Charge = bestChargeState; TotalIntensity = bestTotalIntensity; - StDev = bestStDev; - MassIndex = bestMassIndex; - Score = ScoreIsotopeEnvelope(); + Score = ScoreIsotopeEnvelope(bestStDev); + } + + /// + /// Used for a neutral mass read in from a deconvoluted file + /// Assumes the mass is 
correct: score is max value + /// + public IsotopicEnvelope(double monoisotopicMass, double intensity, int charge) + { + MonoisotopicMass = monoisotopicMass; + Charge = charge; + TotalIntensity = intensity; + Score = double.MaxValue; + Peaks = [(monoisotopicMass.ToMz(charge), intensity)]; } - public double GetMostAbundantObservedIsotopicMass(List<(double mz, double intensity)> peaks, int charge) + /// + /// Used for A deconvolution method that calculates its own score. + /// + /// All missed mono products of the same peak will share an ID if enabled in IsoDec + /// + /// + /// + /// + /// + public IsotopicEnvelope(int id, List<(double mz, double intensity)> peaks, double monoisotopicmass, int chargestate, double intensity, double score) { - return peaks.MaxBy(p => p.intensity).mz * Math.Abs(charge); + PrecursorId = id; + Peaks = peaks; + MonoisotopicMass = monoisotopicmass; + Charge = chargestate; + TotalIntensity = intensity; + Score = score; + MostAbundantObservedIsotopicMass = peaks.MaxBy(p => p.intensity).mz * Math.Abs(chargestate); } public override string ToString() @@ -44,10 +72,10 @@ public override string ToString() return Charge + "\t" + Peaks[0].mz.ToString("G8") + "\t" + Peaks.Count + "\t" + TotalIntensity; } - private double ScoreIsotopeEnvelope() //likely created by Stefan Solntsev using peptide data + private double ScoreIsotopeEnvelope(double stDev) //likely created by Stefan Solntsev using peptide data { return Peaks.Count >= 2 ? 
- TotalIntensity / Math.Pow(StDev, 0.13) * Math.Pow(Peaks.Count, 0.4) / Math.Pow(Math.Abs(Charge), 0.06) : + TotalIntensity / Math.Pow(stDev, 0.13) * Math.Pow(Peaks.Count, 0.4) / Math.Pow(Math.Abs(Charge), 0.06) : 0; } @@ -60,6 +88,5 @@ public void SetMedianMonoisotopicMass(List monoisotopicMassPredictions) { MonoisotopicMass = monoisotopicMassPredictions.Median(); } - } } \ No newline at end of file diff --git a/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs b/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs index 2e9fcc7a4..62422cc96 100644 --- a/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs +++ b/mzLib/MassSpectrometry/MzSpectra/MzSpectrum.cs @@ -22,10 +22,8 @@ using System; using System.Collections; using System.Collections.Generic; -using System.Diagnostics; using System.IO; using System.Linq; -using System.Text.Json; namespace MassSpectrometry { @@ -126,7 +124,7 @@ public MzRange Range } } - public double? FirstX + public virtual double? FirstX { get { @@ -138,7 +136,7 @@ public double? FirstX } } - public double? LastX + public virtual double? 
LastX { get { @@ -373,7 +371,7 @@ public IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateForM } } - return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, Statistics.StandardDeviation(listOfRatios), massIndex); + return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, listOfRatios.StandardDeviation()); } [Obsolete("Deconvolution Has been moved to the Deconvoluter Object")] @@ -611,6 +609,26 @@ public int GetClosestPeakIndex(double x) return XArray.GetClosestIndex(x); } + public List GetPeakIndicesWithinTolerance(double x, Tolerance tolerance) + { + if (XArray.Length == 0) + return []; + + // find min and max allowed + var minX = tolerance.GetMinimumValue(x); + var maxX = tolerance.GetMaximumValue(x); + + // check if min and max are possible to find in this spectrum + if (XArray.First() > maxX || XArray.Last() < minX) + return []; + + // find index closest to extrema + int startingIndex = XArray.GetClosestIndex(minX, ArraySearchOption.Next); + int endIndex = XArray.GetClosestIndex(maxX, ArraySearchOption.Previous); + + return Enumerable.Range(startingIndex, endIndex - startingIndex + 1).ToList(); + } + public void ReplaceXbyApplyingFunction(Func convertor) { for (int i = 0; i < Size; i++) @@ -796,7 +814,12 @@ private MzPeak GetPeak(int index) return peakList[index]; } - private MzPeak GeneratePeak(int index) + /// + /// The source of all peaks which populate the peakList + /// + /// + /// + protected virtual MzPeak GeneratePeak(int index) { return new MzPeak(XArray[index], YArray[index]); } diff --git a/mzLib/MassSpectrometry/MzSpectra/NeutralMassSpectrum.cs b/mzLib/MassSpectrometry/MzSpectra/NeutralMassSpectrum.cs new file mode 100644 index 000000000..dcb5d7d2b --- /dev/null +++ b/mzLib/MassSpectrometry/MzSpectra/NeutralMassSpectrum.cs @@ -0,0 +1,65 @@ +using System; +using Chemistry; + +namespace MassSpectrometry +{ + public class NeutralMassSpectrum : MzSpectrum 
+ { + public int[] Charges { get; init; } + public NeutralMassSpectrum(double[,] monoisotopicMassesIntensities, int[] charges) : base(monoisotopicMassesIntensities) + { + if (monoisotopicMassesIntensities.GetLength(0) != charges.Length) + throw new ArgumentException("The lengths of monoisotopicMasses, intensities, and charges must be the same."); + + Charges = charges; + + double minMz = double.MaxValue; + double maxMz = double.MinValue; + for (int i = 0; i < monoisotopicMassesIntensities.GetLength(0); i++) + { + var mz = monoisotopicMassesIntensities[i,0].ToMz(charges[i]); + if (mz < minMz) + minMz = mz; + if (mz > maxMz) + maxMz = mz; + } + + FirstX = minMz; + LastX = maxMz; + } + + public NeutralMassSpectrum(double[] monoisotopicMasses, double[] intensities, int[] charges, bool shouldCopy) + : base(monoisotopicMasses, intensities, shouldCopy) + { + if (monoisotopicMasses.GetLength(0) != intensities.Length || monoisotopicMasses.Length != charges.Length) + throw new ArgumentException("The lengths of monoisotopicMasses, intensities, and charges must be the same."); + + Charges = charges; + + double minMz = double.MaxValue; + double maxMz = double.MinValue; + for (int i = 0; i < monoisotopicMasses.Length; i++) + { + var mz = monoisotopicMasses[i].ToMz(charges[i]); + if (mz < minMz) + minMz = mz; + if (mz > maxMz) + maxMz = mz; + } + + FirstX = minMz; + LastX = maxMz; + } + + public override double? FirstX { get; } // in m/z + public override double? 
LastX { get; } // in m/z + + /// + /// Converts to a charged spectrum + /// + protected override MzPeak GeneratePeak(int index) + { + return new MzPeak(XArray[index].ToMz(Charges[index]), YArray[index]); + } + } +} diff --git a/mzLib/MassSpectrometry/isodeclib.dll b/mzLib/MassSpectrometry/isodeclib.dll new file mode 100644 index 000000000..a89f4ad96 Binary files /dev/null and b/mzLib/MassSpectrometry/isodeclib.dll differ diff --git a/mzLib/MassSpectrometry/isogenmass.dll b/mzLib/MassSpectrometry/isogenmass.dll new file mode 100644 index 000000000..072d3f72d Binary files /dev/null and b/mzLib/MassSpectrometry/isogenmass.dll differ diff --git a/mzLib/MassSpectrometry/libmmd.dll b/mzLib/MassSpectrometry/libmmd.dll new file mode 100644 index 000000000..e8544358b Binary files /dev/null and b/mzLib/MassSpectrometry/libmmd.dll differ diff --git a/mzLib/MassSpectrometry/svml_dispmd.dll b/mzLib/MassSpectrometry/svml_dispmd.dll new file mode 100644 index 000000000..147636b82 Binary files /dev/null and b/mzLib/MassSpectrometry/svml_dispmd.dll differ diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 0129154a4..f725822f7 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -19,12 +19,93 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Runtime.CompilerServices; using System.Text.RegularExpressions; namespace MzLibUtil { public static class ClassExtensions { + /// + /// Parses the full sequence to identify mods. + /// + /// Full sequence of the peptide in question + /// If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing). + /// If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid. 
+ /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod + public static Dictionary> ParseModifications(this string fullSequence, bool modOnNTerminus=false, bool modOnCTerminus=false) + { + // use a regex to get all modifications + string pattern = @"\[(.+?)\](?> modDict = new(); + + string fullSeq = fullSequence; + RemoveSpecialCharacters(ref fullSeq); + MatchCollection matches = regex.Matches(fullSeq); + int captureLengthSum = 0; + foreach (Match match in matches) + { + GroupCollection group = match.Groups; + string val = group[1].Value; + int startIndex = group[0].Index; + int captureLength = group[0].Length; + + List modList = new List(); + modList.Add(val); + + // The position of the amino acids is tracked by the positionToAddToDict variable. It takes the + // startIndex of the modification Match and removes the cumulative length of the modifications + // found (including the brackets). The difference will be the number of nonmodification characters, + // or the number of amino acids prior to the startIndex in the sequence. + int positionToAddToDict = startIndex - captureLengthSum; + + // Handle N terminus indexing + if ((positionToAddToDict == 0) && !modOnNTerminus) + { + positionToAddToDict++; + } + + // Handle C terminus indexing + if ((fullSeq.Length == startIndex + captureLength) && modOnCTerminus) + { + positionToAddToDict++; + } + + // check to see if key already exist + // if the already key exists, update the current position with the capture length + 1. + // otherwise, add the modification to the dict. + if (modDict.ContainsKey(positionToAddToDict)) + { + modDict[positionToAddToDict].Add(val); + } + else + { + modDict.Add(positionToAddToDict, modList); + } + captureLengthSum += captureLength; + } + return modDict; + } + + /// + /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. 
+ /// + /// + /// + /// + /// + public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|") + { + // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K) + Regex regexSpecialChar = new(specialCharacter); + fullSequence = regexSpecialChar.Replace(fullSequence, replacement); + } + public static double[] BoxCarSmooth(this double[] data, int points) { // Force to be odd @@ -57,6 +138,18 @@ public static T[] SubArray(this T[] data, int index, int length) return result; } + public static bool ToEnum(this int modeInt, out T result) where T : Enum + { + Type enumType = typeof(T); + if (!Enum.IsDefined(enumType, modeInt)) + { + result = default(T); + return false; + } + result = (T)Enum.ToObject(enumType, modeInt); + return true; + } + /// /// Checks if two collections are equivalent, regardless of the order of their contents /// diff --git a/mzLib/MzLibUtil/MzLibException.cs b/mzLib/MzLibUtil/MzLibException.cs index cf86074d8..885081433 100644 --- a/mzLib/MzLibUtil/MzLibException.cs +++ b/mzLib/MzLibUtil/MzLibException.cs @@ -1,13 +1,9 @@ -using System; +#nullable enable +using System; namespace MzLibUtil { [Serializable] - public class MzLibException : Exception - { - public MzLibException(string message) - : base(message) - { - } - } + public class MzLibException(string message, Exception? 
innerException = null) + : Exception(message, innerException); } \ No newline at end of file diff --git a/mzLib/MzLibUtil/MzLibUtil.csproj b/mzLib/MzLibUtil/MzLibUtil.csproj index c6b5cf526..60e7fb93f 100644 --- a/mzLib/MzLibUtil/MzLibUtil.csproj +++ b/mzLib/MzLibUtil/MzLibUtil.csproj @@ -11,10 +11,9 @@ - + - diff --git a/mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings b/mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings new file mode 100644 index 000000000..52f9c2892 --- /dev/null +++ b/mzLib/MzLibUtil/MzLibUtil.csproj.DotSettings @@ -0,0 +1,2 @@ + + True \ No newline at end of file diff --git a/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs b/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs new file mode 100644 index 000000000..0e3efec5c --- /dev/null +++ b/mzLib/MzLibUtil/ObjectPools/DictionaryPool.cs @@ -0,0 +1,94 @@ +using System; +using System.Collections.Generic; +using Microsoft.Extensions.ObjectPool; + +namespace MzLibUtil; + +// Example Usage: +// var pool = new DictionaryPool(); +// var dictionary = pool.Get(); +// try { +// dictionary.Add(1,1); +// Do Work +// } +// finally { +// pool.Return(dictionary); +// } + +/// +/// Provides a pool for instances to reduce memory allocations. +/// This class uses the from Microsoft.Extensions.ObjectPool +/// to manage the pooling of objects. +/// +/// The type of keys in the . +/// The type of values in the . +/// +/// This class is not thread-safe and should not be shared between threads. +/// This class should be pulled from outside a try finally loop and finally should return the Dictionary to the pool to ensure proper pooling in the case of a caught exception. +/// +public class DictionaryPool where TKey : notnull +{ + private readonly ObjectPool> _pool; + + /// + /// Initializes a new instance of the class. + /// + /// Initial capacity for the pooled Dictionary instances. 
+ public DictionaryPool(int initialCapacity = 16) + { + var policy = new DictionaryPooledObjectPolicy(initialCapacity); + var provider = new DefaultObjectPoolProvider { MaximumRetained = Environment.ProcessorCount * 2 }; + _pool = provider.Create(policy); + } + + /// + /// Retrieves a Dictionary instance from the pool. + /// + /// A Dictionary instance. + public Dictionary Get() => _pool.Get(); + + /// + /// Returns a Dictionary instance back to the pool. + /// + /// The Dictionary instance to return. + public void Return(Dictionary dictionary) + { + if (dictionary == null) throw new ArgumentNullException(nameof(dictionary)); + dictionary.Clear(); // Ensure the Dictionary is clean before returning it to the pool + _pool.Return(dictionary); + } + + /// + /// Policy for pooling Dictionary instances with a specified initial capacity. + /// + /// The type of keys in the Dictionary. + /// The type of values in the Dictionary. + /// The initial capacity for the pooled Dictionary instances. + private class DictionaryPooledObjectPolicy(int initialCapacity) + : PooledObjectPolicy> + where TKeyItem : notnull + { + private int InitialCapacity { get; } = initialCapacity; + + /// + /// Creates a new Dictionary instance with the specified initial capacity. + /// + /// A new Dictionary instance. + public override Dictionary Create() + { + return new Dictionary(capacity: InitialCapacity); + } + + /// + /// Returns a Dictionary instance to the pool after clearing it. + /// + /// The Dictionary instance to return. + /// True if the Dictionary instance can be reused; otherwise, false. 
+ public override bool Return(Dictionary obj) + { + // Ensure the Dictionary can be safely reused + obj.Clear(); + return true; + } + } +} diff --git a/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs b/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs new file mode 100644 index 000000000..ae33ffd99 --- /dev/null +++ b/mzLib/MzLibUtil/ObjectPools/HashSetPool.cs @@ -0,0 +1,88 @@ +using System; +using System.Collections.Generic; +using Microsoft.Extensions.ObjectPool; + +namespace MzLibUtil; + + +// Example Usage: +// var pool = new HashSetPool(); +// var hashSet = pool.Get(); +// try { +// hashSet.Add(1); +// Do Work +// } +// finally { +// pool.Return(hashSet); +// } + +/// +/// Provides a pool for instances to reduce memory allocations. +/// This class uses the from Microsoft.Extensions.ObjectPool +/// to manage the pooling of objects. +/// +/// The type of elements in the . +/// +/// This class is not thread-safe and should not be shared between threads. +/// This class should be pulled from outside a try finally loop and finally should return the HashSet to the pool to ensure proper pooling in the case of a caught exception +/// See example found in DigestionAgent.GetDigestionSiteIndices() for proper usage +/// +public class HashSetPool +{ + private readonly ObjectPool> _pool; + + /// + /// Initializes a new instance of the class. + /// + /// Initial capacity for the pooled HashSet instances. + public HashSetPool(int initialCapacity = 16) + { + var policy = new HashSetPooledObjectPolicy(initialCapacity); + _pool = new DefaultObjectPool>(policy); + } + + /// + /// Retrieves a instance from the pool. + /// + /// A instance. + public HashSet Get() => _pool.Get(); + + /// + /// Returns a instance back to the pool. + /// + /// The instance to return. 
+ public void Return(HashSet hashSet) + { + if (hashSet == null) throw new ArgumentNullException(nameof(hashSet)); + hashSet.Clear(); // Ensure the HashSet is clean before returning it to the pool + _pool.Return(hashSet); + } + + /// + /// Defines the policy for creating and returning instances to the pool. + /// + /// The type of elements in the . + private class HashSetPooledObjectPolicy(int initialCapacity) : PooledObjectPolicy> + { + /// + /// Creates a new instance with the specified initial capacity. + /// + /// A new instance. + public override HashSet Create() + { + return new HashSet(capacity: initialCapacity); + } + + /// + /// Returns a instance to the pool after clearing it. + /// + /// The instance to return. + /// Always returns true. + public override bool Return(HashSet obj) + { + // Ensure the HashSet can be safely reused + obj.Clear(); + return true; + } + } +} diff --git a/mzLib/MzLibUtil/ObjectPools/ListPool.cs b/mzLib/MzLibUtil/ObjectPools/ListPool.cs new file mode 100644 index 000000000..9f25a7926 --- /dev/null +++ b/mzLib/MzLibUtil/ObjectPools/ListPool.cs @@ -0,0 +1,90 @@ +using System; +using System.Collections.Generic; +using Microsoft.Extensions.ObjectPool; + +namespace MzLibUtil; + +// Example Usage: +// var pool = new ListPool(); +// var list = pool.Get(); +// try { +// list.Add(1); +// Do Work +// } +// finally { +// pool.Return(list); +// } + +/// +/// Provides a pool for instances to reduce memory allocations. +/// This class uses the from Microsoft.Extensions.ObjectPool +/// to manage the pooling of objects. +/// +/// The type of elements in the . +/// +/// This class is not thread-safe and should not be shared between threads. +/// This class should be pulled from outside a try finally loop and finally should return the List to the pool to ensure proper pooling in the case of a caught exception. +/// +public class ListPool +{ + private readonly ObjectPool> _pool; + + /// + /// Initializes a new instance of the class. 
+ /// + /// Initial capacity for the pooled List instances. + public ListPool(int initialCapacity = 16) + { + var policy = new ListPooledObjectPolicy(initialCapacity); + var provider = new DefaultObjectPoolProvider { MaximumRetained = Environment.ProcessorCount * 2 }; + _pool = provider.Create(policy); + } + + /// + /// Retrieves a List instance from the pool. + /// + /// A List instance. + public List Get() => _pool.Get(); + + /// + /// Returns a List instance back to the pool. + /// + /// The List instance to return. + public void Return(List list) + { + if (list == null) throw new ArgumentNullException(nameof(list)); + list.Clear(); // Ensure the List is clean before returning it to the pool + _pool.Return(list); + } + + /// + /// Policy for pooling List instances with a specified initial capacity. + /// + /// The type of elements in the List. + /// The initial capacity for the pooled List instances. + private class ListPooledObjectPolicy(int initialCapacity) : PooledObjectPolicy> + { + private int InitialCapacity { get; } = initialCapacity; + + /// + /// Creates a new List instance with the specified initial capacity. + /// + /// A new List instance. + public override List Create() + { + return new List(capacity: InitialCapacity); + } + + /// + /// Resets the List instance to a clean state before returning it to the pool. + /// + /// The List instance to reset and return. + /// True if the List instance can be returned to the pool; otherwise, false. 
+ public override bool Return(List obj) + { + // Ensure the List can be safely reused + obj.Clear(); + return true; + } + } +} \ No newline at end of file diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs new file mode 100644 index 000000000..cdb7d33fb --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -0,0 +1,203 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using Easy.Common.Extensions; + +namespace MzLibUtil +{ + // Should this have all of the parent data (i.e. protein group, protein, peptide, peptide position)? Unnecessary for now, but probably useful later. + public class UtilModification + { + public string IdWithMotif { get; set; } + public int PeptidePositionZeroIsNTerminus { get; set; } //NEED TO ENFORCE THIS EVERYWHERE OR CHECK IF ZERO OR ONE + + + public double Intensity { get; set; } + + public UtilModification(string name, int position, double intensity) + { + IdWithMotif = name; + PeptidePositionZeroIsNTerminus = position; + Intensity = intensity; + } + + } + public class UtilPeptide + { + public string FullSequence { get; set; } + public string BaseSequence { get; set; } + public UtilProtein ParentProtein { get; set; } + public int IndexInProtein { get; set; } + public Dictionary> ModifiedAminoAcidPositions { get; set; } + public double Intensity { get; set; } + + public UtilPeptide(string fullSequence, Dictionary> mods = null) + { + FullSequence = fullSequence; + ModifiedAminoAcidPositions = mods.IsNotNullOrEmpty() ? mods : new Dictionary>(); + SetBaseSequence(); + } + public void SetBaseSequence(string modPattern = @"\[(.+?)\](? mods) + { + throw new NotImplementedException(); + } + public void PeptideToProteinPositions(int offset=0) + { + offset = offset != 0 ? 
offset : ParentProtein.Sequence.IndexOf(BaseSequence); + foreach (var modpos in ModifiedAminoAcidPositions.Keys) + { + int positionInProtein = modpos + offset; + Dictionary mods = ModifiedAminoAcidPositions[modpos]; + foreach (var mod in mods.Values) + { + mod.PeptidePositionZeroIsNTerminus = positionInProtein; + } + ModifiedAminoAcidPositions.Add(positionInProtein, mods); + ModifiedAminoAcidPositions.Remove(modpos); + } + } + } + + public class UtilProtein + { + public string Name { get; set; } + public string Sequence { get; set; } + public Dictionary Peptides { get; set; } + + public UtilProtein(string name, Dictionary peptides=null) + { + Name = name; + if (peptides != null) Peptides = peptides; + else Peptides= new Dictionary(); + } + } + + public class UtilProteinGroup + { + public string Name { get; set;} + public Dictionary Proteins { get; set; } + + public UtilProteinGroup(string name, Dictionary proteins = null) + { + Name = name; + if (proteins != null) Proteins = proteins; + else Proteins= new Dictionary(); + } + } + public class PositionFrequencyAnalysis + { + /// + /// Calculates the occupancy of post-translational modifications at the peptide level. + /// + /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. + /// If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing). + /// If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid. 
+ /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity + /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for + /// all of the amino acids in that peptide. + /// + + public Dictionary Occupancy { get; private set; } + public string OccupancyLevel { get; private set; } + + + public void PeptidePTMOccupancy(List, double>> peptides, bool modOnNTerminus = true, bool modOnCTerminus = true) + { + var proteinGroups = new Dictionary(); + + // Go through the peptides given + foreach (var pep in peptides) + { + string fullSeq = pep.Item1; + string baseSeq = pep.Item2.IsNotNullOrEmpty() ? pep.Item2 : new string(pep.Item1.ToCharArray()); + ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", @"\[(.+?)\](? pgs = pep.Item3; + double peptideIntensity = pep.Item4; + + // Go through the peptide's protein groups + foreach (var pgName in pgs) + { + // If have not seen that protein group, store it + if (!proteinGroups.ContainsKey(pgName)) + { + proteinGroups[pgName] = new UtilProteinGroup(pgName); + } + var proteinGroup = proteinGroups[pgName]; + + // Go through the proteins in each protein group + foreach (var proteinName in pgName.Split('|')) + { + // Add the protein to the protein group's dictionary if it has not been added + if (!proteinGroup.Proteins.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName] = new UtilProtein(proteinName); + } + var protein = proteinGroup.Proteins[proteinName]; + + // If the peptide's base sequence has not been seen, add it to the protein's dictionary + if (!protein.Peptides.ContainsKey(baseSeq)) + { + protein.Peptides[baseSeq] = new UtilPeptide(fullSeq); + protein.Peptides[baseSeq].Intensity = 0; + } + + // Increase the total intensity of the peptide base 
sequence to track the total intensity of all amino acids in that sequence + protein.Peptides[baseSeq].Intensity += peptideIntensity; + var peptide = protein.Peptides[baseSeq]; + + // Want both arguments passed here to be true if need to later filter out peptide terminal mods that are not protein terminal mods + Dictionary> peptideMods = fullSeq.ParseModifications(modOnNTerminus, modOnCTerminus); + // Go through the modified positions found from the full sequence + foreach (var modpos in peptideMods) + { + // If that position has not been recorded as containing a modification, add it to the base sequence's dictionary + if (!peptide.ModifiedAminoAcidPositions.ContainsKey(modpos.Key)) + { + peptide.ModifiedAminoAcidPositions[modpos.Key] = new Dictionary(); + } + var modifiedPosition = peptide.ModifiedAminoAcidPositions[modpos.Key]; + + // Go through the modifications found at a modified amino acid index + foreach (var mod in modpos.Value) + { + //If the name of that modification has not been seen, record that modification in the index's dictionary with an intensity of 0 + if (!modifiedPosition.ContainsKey(mod)) + { + modifiedPosition[mod] = new UtilModification(mod, modpos.Key, 0); + } + // Increase the intensity of the modification by the intensity of the peptide + modifiedPosition[mod].Intensity += peptideIntensity; + } + } + } + } + } + Occupancy = proteinGroups; + OccupancyLevel = "peptide"; + } + + public void PeptideToProteinPTMOccupancy(Dictionary proteinSequences) // combine this to previous method. 
+ { + foreach (var pg in Occupancy.Keys) + { + UtilProteinGroup proteinGroup = Occupancy[pg]; + foreach (var prot in proteinGroup.Proteins.Keys) + { + UtilProtein protein = proteinGroup.Proteins[prot]; + foreach (var pep in protein.Peptides.Keys) + { + UtilPeptide peptide = protein.Peptides[pep]; + peptide.ParentProtein = protein; + peptide.PeptideToProteinPositions(); + } + } + } + OccupancyLevel = "protein"; + } + } +} diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs index 2e5d29718..20d0e7abe 100644 --- a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs +++ b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs @@ -18,9 +18,9 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS var subsequence = new StringBuilder(); // modification on peptide N-terminus - if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) + if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification? mod)) { - subsequence.Append('[' + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } for (int r = 0; r < withSetMods.Length; r++) @@ -32,11 +32,11 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS { if (mod.MonoisotopicMass > 0) { - subsequence.Append("[+" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[+{mod.MonoisotopicMass.RoundedDouble(6)}]"); } else { - subsequence.Append("[" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } } } @@ -46,11 +46,11 @@ public static string FullSequenceWithMassShift(this IBioPolymerWithSetMods withS { if (mod.MonoisotopicMass > 0) { - subsequence.Append("[+" + mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[+{mod.MonoisotopicMass.RoundedDouble(6)}]"); } else { - subsequence.Append("[" + 
mod.MonoisotopicMass.RoundedDouble(6).ToString() + ']'); + subsequence.Append($"[{mod.MonoisotopicMass.RoundedDouble(6)}]"); } } return subsequence.ToString(); @@ -68,14 +68,15 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, string essentialSequence = withSetMods.BaseSequence; if (modstoWritePruned != null) { - var sbsequence = new StringBuilder(); + var sbsequence = new StringBuilder(withSetMods.FullSequence.Length); // variable modification on peptide N-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification pep_n_term_variable_mod)) { if (modstoWritePruned.ContainsKey(pep_n_term_variable_mod.ModificationType)) { - sbsequence.Append('[' + pep_n_term_variable_mod.ModificationType + ":" + pep_n_term_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{pep_n_term_variable_mod.ModificationType}:{pep_n_term_variable_mod.IdWithMotif}]"); } } for (int r = 0; r < withSetMods.Length; r++) @@ -86,7 +87,8 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, { if (modstoWritePruned.ContainsKey(residue_variable_mod.ModificationType)) { - sbsequence.Append('[' + residue_variable_mod.ModificationType + ":" + residue_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{residue_variable_mod.ModificationType}:{residue_variable_mod.IdWithMotif}]"); } } } @@ -96,7 +98,8 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, { if (modstoWritePruned.ContainsKey(pep_c_term_variable_mod.ModificationType)) { - sbsequence.Append('[' + pep_c_term_variable_mod.ModificationType + ":" + pep_c_term_variable_mod.IdWithMotif + ']'); + sbsequence.Append( + $"[{pep_c_term_variable_mod.ModificationType}:{pep_c_term_variable_mod.IdWithMotif}]"); } } @@ -112,12 +115,13 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, /// public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMods) { - var subSequence = new 
StringBuilder(); + // start string builder with initial capacity to avoid resizing costs. + var subSequence = new StringBuilder(withSetMods.BaseSequence.Length + withSetMods.AllModsOneIsNterminus.Count * 30); // modification on peptide N-terminus - if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) + if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification? mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } for (int r = 0; r < withSetMods.Length; r++) @@ -127,14 +131,14 @@ public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMo // modification on this residue if (withSetMods.AllModsOneIsNterminus.TryGetValue(r + 2, out mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } } // modification on peptide C-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(withSetMods.Length + 2, out mod)) { - subSequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + subSequence.Append($"[{mod.ModificationType}:{mod.IdWithMotif}]"); } return subSequence.ToString(); diff --git a/mzLib/Omics/Digestion/DigestionAgent.cs b/mzLib/Omics/Digestion/DigestionAgent.cs index d860659b9..734d5e65f 100644 --- a/mzLib/Omics/Digestion/DigestionAgent.cs +++ b/mzLib/Omics/Digestion/DigestionAgent.cs @@ -1,14 +1,12 @@ -using Omics.Modifications; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; +using MzLibUtil; +using Omics.Modifications; namespace Omics.Digestion { public abstract class DigestionAgent { + protected static readonly HashSetPool HashSetPool = new HashSetPool(8); + protected DigestionAgent(string name, CleavageSpecificity cleavageSpecificity, List motifList, Modification cleavageMod) { Name = name; @@ -17,7 +15,7 @@ 
protected DigestionAgent(string name, CleavageSpecificity cleavageSpecificity, L CleavageMod = cleavageMod; } - public string Name { get; init; } + public readonly string Name; public CleavageSpecificity CleavageSpecificity { get; init; } public List DigestionMotifs { get; init; } public Modification CleavageMod { get; set; } @@ -27,6 +25,16 @@ public override string ToString() return Name; } + public override bool Equals(object? obj) + { + return obj is DigestionAgent agent && agent.Name == Name; + } + + public override int GetHashCode() + { + return Name.GetHashCode(); + } + /// /// Is length of given peptide okay, given minimum and maximum? /// @@ -68,40 +76,48 @@ protected static bool ValidMaxLength(int? length, int maxLength) /// public List GetDigestionSiteIndices(string sequence) { - var indices = new List(); - - for (int r = 0; r < sequence.Length; r++) + var indices = HashSetPool.Get(); // use hash set to ensure no duplicates + try // Try block is to ensure that, even if an error gets thrown, the hashset is returned to the pool { - var cutSiteIndex = -1; - bool cleavagePrevented = false; + indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide - foreach (DigestionMotif motif in DigestionMotifs) + for (int r = 0; r < sequence.Length; r++) { - var motifResults = motif.Fits(sequence, r); - bool motifFits = motifResults.Item1; - bool motifPreventsCleavage = motifResults.Item2; + var cutSiteIndex = -1; + bool cleavagePrevented = false; - if (motifFits && r + motif.CutIndex < sequence.Length) + foreach (DigestionMotif motif in DigestionMotifs) { - cutSiteIndex = Math.Max(r + motif.CutIndex, cutSiteIndex); + var motifResults = motif.Fits(sequence, r); + bool motifFits = motifResults.Item1; + bool motifPreventsCleavage = motifResults.Item2; + + if (motifFits && r + motif.CutIndex < sequence.Length) + { + cutSiteIndex = Math.Max(r + motif.CutIndex, cutSiteIndex); + } + + if (motifPreventsCleavage) // if any motif 
prevents cleave + { + cleavagePrevented = true; + } } - if (motifPreventsCleavage) // if any motif prevents cleave + // if no motif prevents cleave + if (!cleavagePrevented && cutSiteIndex != -1) { - cleavagePrevented = true; + indices.Add(cutSiteIndex); } } - // if no motif prevents cleave - if (!cleavagePrevented && cutSiteIndex != -1) - { - indices.Add(cutSiteIndex); - } + indices.Add(sequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide + return indices.ToList(); // convert the hashset to a list for return. + } + finally + { + // return hashset to pool. This clears it and gets it ready for the next time it is needed from the pool. + HashSetPool.Return(indices); } - - indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide - indices.Add(sequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide - return indices.Distinct().OrderBy(i => i).ToList(); } } } diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index 55aed3255..b43980e23 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -1,15 +1,13 @@ -using System; -using System.Collections.Generic; -using System.ComponentModel; -using System.Linq; -using System.Text; -using System.Threading.Tasks; +using MzLibUtil; using Omics.Modifications; namespace Omics.Digestion { public abstract class DigestionProduct { + protected static readonly DictionaryPool> DictionaryPool = new(); + protected static readonly DictionaryPool FixedModDictionaryPool = new(8); + protected string _baseSequence; protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int oneBasedEndResidue, int missedCleavages, @@ -41,33 +39,66 @@ protected DigestionProduct(IBioPolymer parent, int oneBasedStartResidue, int one public int Length => BaseSequence.Length; //how many residues long 
the peptide is public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; - protected static IEnumerable> GetVariableModificationPatterns(Dictionary> possibleVariableModifications, int maxModsForPeptide, int peptideLength) + #region Digestion Helper Methods + + /// + /// Generates all possible variable modification patterns for a peptide, which includes variable and localized modifications but excludes fixed mods + /// + /// A dictionary of possible variable modifications with their positions. + /// The maximum number of modifications allowed for the peptide. + /// The length of the peptide. + /// An enumerable of dictionaries representing different modification patterns. + /// + /// This method generates all possible combinations of variable modifications for a given peptide. + /// It first calculates the total number of available modifications and the maximum number of variable modifications allowed. + /// Then, it iterates through all possible numbers of modifications and generates the corresponding modification patterns. + /// The returned dictionary is then appended with fixed modifications and used to construct a peptide with set mods + /// + protected static IEnumerable> GetVariableModificationPatterns(Dictionary> possibleVariableModifications, int maxModsForPeptide, int peptideLength) { - if (possibleVariableModifications.Count == 0) - { - yield return null; - } - else - { - var possible_variable_modifications = new Dictionary>(possibleVariableModifications); + if (possibleVariableModifications.Count <= 0) + yield break; + + int[] baseVariableModificationPattern = new int[peptideLength + 4]; + int totalAvailableMods = possibleVariableModifications.Values.Sum(modList => modList?.Count ?? 
0); + int maxVariableMods = Math.Min(totalAvailableMods, maxModsForPeptide); + var variableModKvpList = possibleVariableModifications.ToList(); - int[] base_variable_modification_pattern = new int[peptideLength + 4]; - var totalAvailableMods = possible_variable_modifications.Sum(b => b.Value == null ? 0 : b.Value.Count); - for (int variable_modifications = 0; variable_modifications <= Math.Min(totalAvailableMods, maxModsForPeptide); variable_modifications++) + for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) + { + foreach (int[] variable_modification_pattern in GetVariableModificationPatternsRecursive(variableModKvpList, + possibleVariableModifications.Count - variable_modifications, baseVariableModificationPattern, 0)) { - foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(new List>>(possible_variable_modifications), - possible_variable_modifications.Count - variable_modifications, base_variable_modification_pattern, 0)) + // use modification pattern to construct a dictionary of modifications for the peptide + var modificationPattern = new Dictionary(possibleVariableModifications.Count); + + foreach (var variableModSet in possibleVariableModifications) { - yield return GetNewVariableModificationPattern(variable_modification_pattern, possible_variable_modifications); + int modIndex = variable_modification_pattern[variableModSet.Key] - 1; + if (modIndex >= 0) + { + modificationPattern.Add(variableModSet.Key, variableModSet.Value.ElementAt(modIndex)); + } } + + yield return modificationPattern; } } } - protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(int length, - IEnumerable allKnownFixedModifications) + /// + /// Sets the fixed modifications for the peptide, considering the N-terminal and C-terminal positions, by populating the dictionary. + /// + /// The length of the peptide. + /// A collection of all known fixed modifications. 
+ /// A reference to a dictionary that will hold the fixed modifications, with the key representing the position. + /// + /// This method iterates through all known fixed modifications and assigns them to the appropriate positions in the peptide. + /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. + /// + protected void PopulateFixedModsOneIsNorFivePrimeTerminus(int length, + IEnumerable allKnownFixedModifications, in Dictionary fixedModsOneIsNterminus) { - var fixedModsOneIsNterminus = new Dictionary(length + 3); foreach (Modification mod in allKnownFixedModifications) { switch (mod.LocationRestriction) @@ -76,18 +107,28 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "Oligo 5'-terminal.": case "N-terminal.": case "Peptide N-terminal.": - //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginign of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) + //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginning of the protein + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) { - if (OneBasedStartResidue != 1) + if (mod.ModificationType == "Protease") // Protease N-terminal or 5' modification { - fixedModsOneIsNterminus[2] = mod; + if (OneBasedStartResidue != 1) + fixedModsOneIsNterminus[2] = mod; + } + else if (OneBasedStartResidue == 1) // Modified BioPolymer Start Residue (e.g. Protein N-Terminal) + { + if (!fixedModsOneIsNterminus.TryAdd(1, mod)) // Check if a protein N-terminal mod is already present + { + if (mod.LocationRestriction is "N-terminal." 
or "5'-terminal.") // Only overwrite if new mod is N-terminal, not peptide N-terminal + { + fixedModsOneIsNterminus[1] = mod; + } + } + } + else //Normal N-terminal peptide modification + { + fixedModsOneIsNterminus[1] = mod; } - } - //Normal N-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) - { - fixedModsOneIsNterminus[1] = mod; } break; @@ -106,17 +147,27 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "C-terminal.": case "Peptide C-terminal.": //the modification is protease associated and is applied to the c-terminal cleaved residue, not if it is at the end of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) { - if (OneBasedEndResidue != Parent.Length) + if (mod.ModificationType == "Protease") // Protease C-terminal or 3' modification { - fixedModsOneIsNterminus[length + 1] = mod; + if (OneBasedEndResidue != Parent.Length) + fixedModsOneIsNterminus[length + 1] = mod; + } + else if (OneBasedEndResidue == Parent.Length) // Modified BioPolymer End Residue (e.g. Protein C-Terminal) + { + if (!fixedModsOneIsNterminus.TryAdd(length + 2, mod)) // Check if a protein C-terminal mod is already present + { + if (mod.LocationRestriction is "C-terminal." 
or "3'-terminal.") // Only overwrite if new mod is C-terminal, not peptide C-terminal + { + fixedModsOneIsNterminus[length + 2] = mod; + } + } + } + else //Normal C-terminal peptide modification + { + fixedModsOneIsNterminus[length + 2] = mod; } - } - //Normal C-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) - { - fixedModsOneIsNterminus[length + 2] = mod; } break; @@ -124,11 +175,143 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in throw new NotSupportedException("This terminus localization is not supported."); } } - return fixedModsOneIsNterminus; } + /// + /// Populates the variable modifications dictionary from both the variable modifications and the localized mods from xml reading, + /// considering the N-terminal, C-terminal, and internal positions. + /// + /// A list of all variable modifications. + /// A reference to a dictionary that will hold the variable modifications, with the key representing the position. + /// + /// This method iterates through all variable modifications and assigns them to the appropriate positions in the peptide. + /// It considers different location restrictions such as N-terminal, C-terminal, and anywhere within the peptide. 
+ /// + protected void PopulateVariableModifications(List allVariableMods, in Dictionary> twoBasedDictToPopulate) + { + int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; + var pepNTermVariableMods = new SortedSet(); + twoBasedDictToPopulate.Add(1, pepNTermVariableMods); + + var pepCTermVariableMods = new SortedSet(); + twoBasedDictToPopulate.Add(peptideLength + 2, pepCTermVariableMods); + + // VARIABLE MODS + foreach (Modification variableModification in allVariableMods) + { + // Check if can be a n-term mod + if (CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Parent, 1, variableModification)) + { + pepNTermVariableMods.Add(variableModification); + } - private static IEnumerable GetVariableModificationPatterns(List>> possibleVariableModifications, + for (int r = 0; r < peptideLength; r++) + { + if (ModificationLocalization.ModFits(variableModification, Parent.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere." 
&& !ModificationLocalization.UniprotModExists(Parent, r + 1, variableModification)) + { + if (!twoBasedDictToPopulate.TryGetValue(r + 2, out var residueVariableMods)) + { + residueVariableMods = new SortedSet() { variableModification }; + twoBasedDictToPopulate.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + } + // Check if can be a c-term mod + if (CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Parent, peptideLength, variableModification)) + { + pepCTermVariableMods.Add(variableModification); + } + } + + // LOCALIZED MODS + foreach (var kvp in Parent.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; + if (!inBounds) + { + continue; + } + + int locInPeptide = kvp.Key - OneBasedStartResidue + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is not Modification variableModification) + continue; + + // Check if can be a n-term mod + if (locInPeptide == 1 && CanBeNTerminalOrFivePrime(variableModification, peptideLength) && !Parent.IsDecoy) + { + pepNTermVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < peptideLength + && (Parent.IsDecoy || + (ModificationLocalization.ModFits(variableModification, Parent.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedDictToPopulate.TryGetValue(r + 2, out var residueVariableMods)) + { + residueVariableMods = new SortedSet() { variableModification }; + twoBasedDictToPopulate.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + + // Check if can be a c-term mod + if (locInPeptide == peptideLength && CanBeCTerminalOrThreePrime(variableModification, peptideLength) && !Parent.IsDecoy) + { + 
pepCTermVariableMods.Add(variableModification); + } + } + } + } + + /// + /// Appends fixed modifications to the variable modification pattern when no variable mod exists. + /// + /// The dictionary containing fixed modifications. + /// The dictionary containing the variable modification pattern. + /// The number of fixed modifications appended. + /// + /// This method iterates through the fixed modifications and adds them to the variable modification pattern + /// if they are not already present. The number of fixed modifications appended is returned via the out parameter. + /// + protected void AppendFixedModificationsToVariable(in Dictionary fixedModDict, in Dictionary variableModPattern, out int numFixedMods) + { + numFixedMods = 0; + foreach (var fixedModPattern in fixedModDict) + { + if (variableModPattern.ContainsKey(fixedModPattern.Key)) + continue; + numFixedMods++; + variableModPattern.Add(fixedModPattern.Key, fixedModPattern.Value); + } + } + + /// + /// Recursively generates all possible variable modification patterns for a peptide. + /// + /// A list of key-value pairs representing possible variable modifications and their positions. + /// The number of unmodified residues desired in the pattern. + /// An array representing the current modification pattern. + /// The current index in the list of possible modifications. + /// An enumerable of arrays representing different modification patterns. The array index corresponds to the location of the modification + /// in the peptide, while the value at that index determines which index in the list of modifications + /// to add to the final variable modification pattern + /// + /// This method uses recursion to generate all possible combinations of variable modifications for a given peptide. + /// It considers both modified and unmodified residues and generates patterns accordingly. 
+ /// + private static IEnumerable GetVariableModificationPatternsRecursive(List>> possibleVariableModifications, int unmodifiedResiduesDesired, int[] variableModificationPattern, int index) { if (index < possibleVariableModifications.Count - 1) @@ -136,7 +319,7 @@ private static IEnumerable GetVariableModificationPatterns(List 0) { variableModificationPattern[possibleVariableModifications[index].Key] = 0; - foreach (int[] new_variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications, + foreach (int[] new_variable_modification_pattern in GetVariableModificationPatternsRecursive(possibleVariableModifications, unmodifiedResiduesDesired - 1, variableModificationPattern, index + 1)) { yield return new_variable_modification_pattern; @@ -147,7 +330,7 @@ private static IEnumerable GetVariableModificationPatterns(List GetVariableModificationPatterns(List GetNewVariableModificationPattern(int[] variableModificationArray, - IEnumerable>> possibleVariableModifications) + /// + /// Determines if a modification can be applied to the N-terminal or 5' end of the peptide. + /// + /// The modification to check. + /// The length of the peptide. + /// True if the modification can be applied to the N-terminal or 5' end; otherwise, false. + private bool CanBeNTerminalOrFivePrime(Modification mod, int peptideLength) { - var modification_pattern = new Dictionary(); - - foreach (KeyValuePair> kvp in possibleVariableModifications) - { - if (variableModificationArray[kvp.Key] > 0) - { - modification_pattern.Add(kvp.Key, kvp.Value[variableModificationArray[kvp.Key] - 1]); - } - } - - return modification_pattern; + return mod.LocationRestriction is "5'-terminal." or "Oligo 5'-terminal." or "N-terminal." or "Peptide N-terminal." + && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, peptideLength, OneBasedStartResidue); } + /// + /// Determines if a modification can be applied to the C-terminal or 3' end of the peptide. 
+ /// + /// The modification to check. + /// The length of the peptide. + /// True if the modification can be applied to the C-terminal or 3' end; otherwise, false. + private bool CanBeCTerminalOrThreePrime(Modification mod, int peptideLength) + { + return mod.LocationRestriction is "3'-terminal." or "Oligo 3'-terminal." or "C-terminal." or "Peptide C-terminal." + && ModificationLocalization.ModFits(mod, Parent.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1); + } + #endregion } } diff --git a/mzLib/Omics/Fragmentation/FragmentationTerminus.cs b/mzLib/Omics/Fragmentation/FragmentationTerminus.cs index 146309caa..788041690 100644 --- a/mzLib/Omics/Fragmentation/FragmentationTerminus.cs +++ b/mzLib/Omics/Fragmentation/FragmentationTerminus.cs @@ -1,19 +1,12 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace Omics.Fragmentation +namespace Omics.Fragmentation { public enum FragmentationTerminus - { - Both, //N- and C-terminus - N, //N-terminus only - C, //C-terminus only + { + Both, //N- and C-terminus + N, //N-terminus only + C, //C-terminus only None, //used for internal fragments, could be used for top down intact mass? 
FivePrime, // 5' for NucleicAcids ThreePrime, // 3' for NucleicAcids - } - + } } diff --git a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs index d5b020160..4302fadcb 100644 --- a/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs +++ b/mzLib/Omics/Fragmentation/Oligo/DissociationTypeCollection.cs @@ -1 +1,161 @@ -using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using Chemistry; using MassSpectrometry; namespace Omics.Fragmentation.Oligo { /// /// Methods dealing with specific product type for RNA molecules /// public static class DissociationTypeCollection { /// /// Product Ion types by dissociation method /// private static readonly Dictionary> ProductsFromDissociationType = new Dictionary>() { { DissociationType.Unknown, new List() }, { DissociationType.CID, new List { ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M } }, { DissociationType.LowCID, new List() { } }, { DissociationType.IRMPD, new List() { } }, { DissociationType.ECD, new List { } }, { DissociationType.PQD, new List { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.yWaterLoss, ProductType.d, ProductType.M } }, { DissociationType.ETD, new List { } }, { DissociationType.HCD, new List { ProductType.w, ProductType.y, ProductType.aBaseLoss, ProductType.dWaterLoss, ProductType.M } }, { DissociationType.AnyActivationType, new List { } }, { DissociationType.EThcD, new List { } }, { DissociationType.Custom, new List { } }, { DissociationType.ISCID, new List { } } }; /// /// Returns list of products types based upon the dissociation type /// /// /// public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => 
ProductsFromDissociationType[dissociationType]; /// /// Mass to be added or subtracted /// private static readonly Dictionary FragmentIonCaps = new Dictionary { { ProductType.a, ChemicalFormula.ParseFormula("H") }, { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.b, ChemicalFormula.ParseFormula("OH") }, { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, { ProductType.w, ChemicalFormula.ParseFormula("H") }, { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, { ProductType.xWaterLoss, ChemicalFormula.ParseFormula("O-2H-1") }, { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, //fragment - Base chemical formula is the corresponding fragment chemical formula subtracing 1 H as H is lost when base is removed { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 { ProductType.M, new ChemicalFormula() } }; /// /// Returns mass shift by 
product type /// /// /// public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) { switch (fragmentType) { case ProductType.a: case ProductType.aWaterLoss: case ProductType.aBaseLoss: case ProductType.b: case ProductType.bWaterLoss: case ProductType.bBaseLoss: case ProductType.c: case ProductType.cWaterLoss: case ProductType.cBaseLoss: case ProductType.d: case ProductType.dWaterLoss: case ProductType.dBaseLoss: return FragmentationTerminus.FivePrime; case ProductType.w: case ProductType.wWaterLoss: case ProductType.wBaseLoss: case ProductType.x: case ProductType.xWaterLoss: case ProductType.xBaseLoss: case ProductType.y: case ProductType.yWaterLoss: case ProductType.yBaseLoss: case ProductType.z: case ProductType.zWaterLoss: case ProductType.zBaseLoss: return FragmentationTerminus.ThreePrime; case ProductType.M: return FragmentationTerminus.None; case ProductType.aStar: case ProductType.aDegree: case ProductType.bAmmoniaLoss: case ProductType.yAmmoniaLoss: case ProductType.zPlusOne: case ProductType.D: case ProductType.Ycore: case ProductType.Y: default: throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); } } /// /// Product ion types by Fragmentation Terminus /// private static readonly Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> { { FragmentationTerminus.FivePrime, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, } }, { FragmentationTerminus.ThreePrime, new List { ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, 
ProductType.zWaterLoss, ProductType.zBaseLoss, } }, { FragmentationTerminus.Both, new List { ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, ProductType.M } } }; public static List GetRnaTerminusSpecificProductTypes( this FragmentationTerminus fragmentationTerminus) { return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; } /// /// Returns all product ion types based upon specified terminus /// /// /// /// public static List GetRnaTerminusSpecificProductTypesFromDissociation( this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) { var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); return terminusSpecific.Intersect(dissociationSpecific).ToList(); } } } \ No newline at end of file +using Chemistry; +using MassSpectrometry; + +namespace Omics.Fragmentation.Oligo +{ + /// + /// Methods dealing with specific product type for RNA molecules + /// + public static class DissociationTypeCollection + { + /// + /// Product Ion types by dissociation method + /// + /// + /// HCD ions were taken from the following paper: https://www.nature.com/articles/s41598-023-36193-2 + /// Ion types below here should be validated with experimental results. + /// Base and water losses occur very frequently and may also be present in these activation types. 
+ /// CID, UVPD, and aEPD ions were taken from the following paper: https://pubs.acs.org/doi/10.1021/acs.analchem.3c05428?ref=PDF + /// NETD ions were taken from the following paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7161943/ + /// lowCID ions were taken from this Thermo Poster: https://assets.thermofisher.com/TFS-Assets/CMD/Flyers/fl-489263-asms23-optimized-fragmentation-oligonucleotides-suppresses-undesired-fragmentation-fl489263-en.pdf + /// + public static Dictionary> ProductsFromDissociationType = + new Dictionary>() + { + { DissociationType.Unknown, new List() }, + { DissociationType.Custom, new List() }, + { + DissociationType.AnyActivationType, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.aWaterLoss, + ProductType.b, ProductType.bBaseLoss, ProductType.bWaterLoss, + ProductType.c, ProductType.cBaseLoss, ProductType.cWaterLoss, + ProductType.d, ProductType.dBaseLoss, ProductType.dWaterLoss, + ProductType.w, ProductType.wBaseLoss, ProductType.wWaterLoss, + ProductType.x, ProductType.xBaseLoss, ProductType.xWaterLoss, + ProductType.y, ProductType.yBaseLoss, ProductType.yWaterLoss, + ProductType.z, ProductType.zBaseLoss, ProductType.zWaterLoss, + ProductType.M + } + }, + { + DissociationType.CID, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, + ProductType.y, ProductType.yWaterLoss, ProductType.M + } + }, + { + DissociationType.HCD, new List + { + ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, + ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, + ProductType.M + } + }, + { + DissociationType.UVPD, new List + { + ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.M + } + }, + { + DissociationType.aEPD, new List + { + ProductType.a, ProductType.c, ProductType.d, ProductType.w, ProductType.x, ProductType.z, ProductType.M + } + }, + { + DissociationType.NETD, new List + { + 
ProductType.w, ProductType.d, ProductType.M + } + }, + { + DissociationType.LowCID, new List() + { + ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, + ProductType.y, ProductType.yWaterLoss, ProductType.M + } + }, + { DissociationType.IRMPD, new List() { } }, + { DissociationType.ECD, new List { } }, + { DissociationType.PQD, new List { } }, + { DissociationType.ETD, new List { } }, + { DissociationType.EThcD, new List { } }, + }; + + /// + /// Returns all dissociation types with implemented product type collections + /// + public static IEnumerable AllImplementedDissociationTypes => + ProductsFromDissociationType.Where(p => p.Value.Any()) + .Select(p => p.Key); + + /// + /// Returns list of products types based upon the dissociation type + /// + /// + /// + public static List GetRnaProductTypesFromDissociationType(this DissociationType dissociationType) => + ProductsFromDissociationType[dissociationType]; + + /// + /// Returns mass shift by product type + /// + /// + /// + public static double GetRnaMassShiftFromProductType(this ProductType type) => FragmentIonCaps[type].MonoisotopicMass; + + /// + /// Mass to be added or subtracted + /// + private static readonly Dictionary FragmentIonCaps = + new Dictionary + { + { ProductType.a, ChemicalFormula.ParseFormula("H") }, + { ProductType.aWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, + { ProductType.b, ChemicalFormula.ParseFormula("OH") }, + { ProductType.bWaterLoss, ChemicalFormula.ParseFormula("H-1") }, + { ProductType.c, ChemicalFormula.ParseFormula("O3H2P") }, + { ProductType.cWaterLoss, ChemicalFormula.ParseFormula("O2P") }, + { ProductType.d, ChemicalFormula.ParseFormula("O4H2P") }, + { ProductType.dWaterLoss, ChemicalFormula.ParseFormula("O3P") }, + + { ProductType.w, ChemicalFormula.ParseFormula("H") }, + { ProductType.wWaterLoss, ChemicalFormula.ParseFormula("H-1O-1") }, + { ProductType.x, ChemicalFormula.ParseFormula("O-1H") }, + { ProductType.xWaterLoss, 
ChemicalFormula.ParseFormula("O-2H-1") }, + { ProductType.y, ChemicalFormula.ParseFormula("O-3P-1") }, + { ProductType.yWaterLoss, ChemicalFormula.ParseFormula("O-4H-2P-1") }, + { ProductType.z, ChemicalFormula.ParseFormula("O-4P-1") }, + { ProductType.zWaterLoss, ChemicalFormula.ParseFormula("O-5H-2P-1") }, + //fragment - Base chemical formula is the corresponding fragment chemical formula subtracting 1 H as H is lost when base is removed + { ProductType.aBaseLoss, ChemicalFormula.ParseFormula("H-2") }, // "H-1" -H + { ProductType.bBaseLoss, ChemicalFormula.ParseFormula("O1H-2") }, //"OH1" -H + { ProductType.cBaseLoss, ChemicalFormula.ParseFormula("O3H-1P") }, //"O3P" -H + { ProductType.dBaseLoss, ChemicalFormula.ParseFormula("O4H-1P") }, //"O4H2P" -H + + { ProductType.wBaseLoss, ChemicalFormula.ParseFormula("H-2") }, //"H"-H + { ProductType.xBaseLoss, ChemicalFormula.ParseFormula("O-1H-2") }, //"O-1H" -H + { ProductType.yBaseLoss, ChemicalFormula.ParseFormula("O-3H-2P-1") }, //"O-3P-1" -H + { ProductType.zBaseLoss, ChemicalFormula.ParseFormula("O-4H-3P-1") }, //"O-4H-1P-1" -1 + + { ProductType.M, new ChemicalFormula() } + }; + + /// + /// Returns all product ion types based upon specified terminus + /// + /// + /// + /// + public static List GetRnaTerminusSpecificProductTypesFromDissociation( + this DissociationType dissociationType, FragmentationTerminus fragmentationTerminus) + { + var terminusSpecific = fragmentationTerminus.GetRnaTerminusSpecificProductTypes(); + var dissociationSpecific = dissociationType.GetRnaProductTypesFromDissociationType(); + return terminusSpecific.Intersect(dissociationSpecific).ToList(); + } + } +} diff --git a/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs b/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs new file mode 100644 index 000000000..0ec5541cd --- /dev/null +++ b/mzLib/Omics/Fragmentation/Oligo/TerminusSpecificProductTypes.cs @@ -0,0 +1,141 @@ +using System; +using
System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Omics.Fragmentation.Oligo +{ + public static class TerminusSpecificProductTypes + { + public static List GetRnaTerminusSpecificProductTypes( + this FragmentationTerminus fragmentationTerminus) + { + return ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]; + } + + /// + /// The types of ions that can be generated from an oligo fragment, based on the terminus of the fragment + /// + public static Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary> + { + { + FragmentationTerminus.FivePrime, new List + { + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + } + }, + { + FragmentationTerminus.ThreePrime, new List + { + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + } + }, + { + FragmentationTerminus.Both, new List + { + + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + ProductType.M + } + + }, + { + FragmentationTerminus.None, new List() + } + }; + + + public static FragmentationTerminus GetRnaTerminusType(this ProductType fragmentType) + { + switch (fragmentType) + { + case 
ProductType.a: + case ProductType.aWaterLoss: + case ProductType.aBaseLoss: + case ProductType.b: + case ProductType.bWaterLoss: + case ProductType.bBaseLoss: + case ProductType.c: + case ProductType.cWaterLoss: + case ProductType.cBaseLoss: + case ProductType.d: + case ProductType.dWaterLoss: + case ProductType.dBaseLoss: + case ProductType.w: + case ProductType.wWaterLoss: + case ProductType.wBaseLoss: + case ProductType.x: + case ProductType.xWaterLoss: + case ProductType.xBaseLoss: + case ProductType.y: + case ProductType.yWaterLoss: + case ProductType.yBaseLoss: + case ProductType.z: + case ProductType.zWaterLoss: + case ProductType.zBaseLoss: + case ProductType.M: + return ProductTypeToFragmentationTerminus[fragmentType]; + + case ProductType.aStar: + case ProductType.aDegree: + case ProductType.bAmmoniaLoss: + case ProductType.yAmmoniaLoss: + case ProductType.zPlusOne: + case ProductType.D: + case ProductType.Ycore: + case ProductType.Y: + default: + throw new ArgumentOutOfRangeException(nameof(fragmentType), fragmentType, null); + } + } + + + /// + /// The terminus of the oligo fragment that the product ion is generated from + /// + public static Dictionary ProductTypeToFragmentationTerminus = new Dictionary + { + { ProductType.a, FragmentationTerminus.FivePrime }, + { ProductType.aWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.aBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.b, FragmentationTerminus.FivePrime }, + { ProductType.bWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.bBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.c, FragmentationTerminus.FivePrime }, + { ProductType.cWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.cBaseLoss, FragmentationTerminus.FivePrime }, + { ProductType.d, FragmentationTerminus.FivePrime }, + { ProductType.dWaterLoss, FragmentationTerminus.FivePrime }, + { ProductType.dBaseLoss, FragmentationTerminus.FivePrime }, + + { ProductType.w, 
FragmentationTerminus.ThreePrime }, + { ProductType.wWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.wBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.x, FragmentationTerminus.ThreePrime }, + { ProductType.xWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.xBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.y, FragmentationTerminus.ThreePrime }, + { ProductType.yWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.yBaseLoss, FragmentationTerminus.ThreePrime }, + { ProductType.z, FragmentationTerminus.ThreePrime }, + { ProductType.zWaterLoss, FragmentationTerminus.ThreePrime }, + { ProductType.zBaseLoss, FragmentationTerminus.ThreePrime }, + + { ProductType.M, FragmentationTerminus.Both } + }; + } +} diff --git a/mzLib/Omics/IBioPolymer.cs b/mzLib/Omics/IBioPolymer.cs index 1cb8aa09a..43ceea17d 100644 --- a/mzLib/Omics/IBioPolymer.cs +++ b/mzLib/Omics/IBioPolymer.cs @@ -3,7 +3,7 @@ namespace Omics { - public interface IBioPolymer + public interface IBioPolymer : IEquatable { string Name { get; } string FullName { get; } @@ -29,5 +29,14 @@ public interface IBioPolymer IEnumerable Digest(IDigestionParams digestionParams, List allKnownFixedModifications, List variableModifications, List silacLabels = null, (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels = null, bool topDownTruncationSearch = false); + + bool IEquatable.Equals(IBioPolymer? 
other) + { + if (other is null) return false; + if (ReferenceEquals(this, other)) return true; + if (other.GetType() != GetType()) return false; + return Accession == other.Accession + && BaseSequence == other.BaseSequence; + } } } diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs index 1c3ade66a..47186b89f 100644 --- a/mzLib/Omics/IBioPolymerWithSetMods.cs +++ b/mzLib/Omics/IBioPolymerWithSetMods.cs @@ -14,7 +14,7 @@ namespace Omics /// Proteins -> PeptideWithSetModifications : ProteolyticPeptide /// Nucleic Acids -> OligoWithSetMods : NucleolyticOligo /// - public interface IBioPolymerWithSetMods : IHasChemicalFormula + public interface IBioPolymerWithSetMods : IHasChemicalFormula, IEquatable { string BaseSequence { get; } string FullSequence { get; } @@ -50,7 +50,16 @@ public void Fragment(DissociationType dissociationType, FragmentationTerminus fr public void FragmentInternally(DissociationType dissociationType, int minLengthOfFragments, List products); - public IBioPolymerWithSetMods Localize(int j, double massToLocalize); + /// + /// Outputs a duplicate IBioPolymerWithSetMods with a localized mass shift, replacing a modification when present + /// + /// Used to localize an unknown mass shift in the MetaMorpheus Localization Engine + /// + /// + /// The index of the modification in the AllModOneIsNTerminus Dictionary - 2 (presumably the zero-based residue index, since that dictionary uses key 1 for the N-terminus and key r + 2 for zero-based residue r; TODO confirm against implementers) + /// The mass to add to the BioPolymer + /// + public IBioPolymerWithSetMods Localize(int indexOfMass, double massToLocalize); public static string GetBaseSequenceFromFullSequence(string fullSequence) { @@ -73,5 +82,86 @@ public static string GetBaseSequenceFromFullSequence(string fullSequence) } return sb.ToString(); } + + /// + /// Returns a list of modifications and their OneBased index from a full sequence + /// + /// Full sequence + /// All known modifications + /// + /// When a full sequence is not in the correct format or a mod is not found in the allModsKnown dictionary + public
static Dictionary GetModificationDictionaryFromFullSequence(string fullSequence, + Dictionary allModsKnown) + { + var allModsOneIsNterminus = new Dictionary(); + var baseSequence = GetBaseSequenceFromFullSequence(fullSequence); + int currentModStart = 0; + int currentModificationLocation = 1; + bool currentlyReadingMod = false; + int bracketCount = 0; + + for (int r = 0; r < fullSequence.Length; r++) + { + char c = fullSequence[r]; + if (c == '[') + { + currentlyReadingMod = true; + if (bracketCount == 0) + { + currentModStart = r + 1; + } + bracketCount++; + } + else if (c == ']') + { + string modId = null; + bracketCount--; + if (bracketCount == 0) + { + try + { + //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot") + string modString = fullSequence.Substring(currentModStart, r - currentModStart); + int splitIndex = modString.IndexOf(':'); + string modType = modString.Substring(0, splitIndex); + modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); + } + catch (Exception e) + { + throw new MzLibUtil.MzLibException( + "Error while trying to parse string into peptide: " + e.Message, e); + + } + if (!allModsKnown.TryGetValue(modId, out var mod)) + { + throw new MzLibUtil.MzLibException( + "Could not find modification while reading string: " + fullSequence); + } + if (mod.LocationRestriction.Contains("C-terminal.") && r == fullSequence.Length - 1) + { + currentModificationLocation = baseSequence.Length + 2; + } + allModsOneIsNterminus.Add(currentModificationLocation, mod); + currentlyReadingMod = false; + } + } + else if (!currentlyReadingMod) + { + currentModificationLocation++; + } + //else do nothing + } + + return allModsOneIsNterminus; + } + + /// + /// Returns a list of modifications from a full sequence + /// + /// Full sequence + /// All known modifications + /// + public static List GetModificationsFromFullSequence(string fullSequence, + Dictionary allModsKnown) => [.. 
GetModificationDictionaryFromFullSequence(fullSequence, allModsKnown).Values]; } } diff --git a/mzLib/Omics/Modifications/Modification.cs b/mzLib/Omics/Modifications/Modification.cs index 5b2beaa81..302bdc936 100644 --- a/mzLib/Omics/Modifications/Modification.cs +++ b/mzLib/Omics/Modifications/Modification.cs @@ -1,10 +1,5 @@ using Chemistry; using MassSpectrometry; -using Omics.Modifications; -using System; -using System.Collections.Generic; -using System.Globalization; -using System.Linq; using System.Text; namespace Omics.Modifications @@ -13,7 +8,7 @@ namespace Omics.Modifications /// Represents a modification /// Mods.txt format was taken from https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/docs/ptmlist.txt /// - public class Modification + public class Modification : IComparable { public string IdWithMotif { get; private set; } public string OriginalId { get; private set; } @@ -299,5 +294,24 @@ public string ModificationErrorsToString() //reports errors in required fields. return sb.ToString(); } + + + // Used in the sorted sets for variable mod generation to ensure that modifications are consistently ordered + // UniProt annotations also contain an evidence level. Future work could include this in the ordering of modifications for digestion. + public int CompareTo(Modification? 
other) + { + if (other == null) return 1; + + int idComparison = string.Compare(this.IdWithMotif, other.IdWithMotif, StringComparison.Ordinal); + if (idComparison != 0) return idComparison; + + int typeComparison = string.Compare(this.ModificationType, other.ModificationType, StringComparison.Ordinal); + if (typeComparison != 0) return typeComparison; + + int locRestrictionComparison = string.Compare(this.LocationRestriction, other.LocationRestriction, StringComparison.Ordinal); + if (locRestrictionComparison != 0) return locRestrictionComparison; + + return Nullable.Compare(this.MonoisotopicMass, other.MonoisotopicMass); + } } } \ No newline at end of file diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs b/mzLib/Omics/Modifications/ModificationLocalization.cs index bbf25d1a3..01dadaa18 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -2,19 +2,28 @@ { public static class ModificationLocalization { + // This method is called a ton (8.8 billion times in Bottom-Up Jenkins as of 1.0.6) in MetaMorpheus. If changes are made, ensure they are efficient. public static bool ModFits(Modification attemptToLocalize, string sequence, int digestionProductOneBasedIndex, int digestionProductLength, int bioPolymerOneBasedIndex) { // First find the capital letter... 
- var motif = attemptToLocalize.Target; - var motifStartLocation = motif.ToString().IndexOf(motif.ToString().First(b => char.IsUpper(b))); + var motif = attemptToLocalize.Target.ToString(); + var motifStartLocation = -1; + for (int i = 0; i < motif.Length; i++) + { + if (!char.IsUpper(motif[i])) + continue; + + motifStartLocation = i; + break; + } // Look up starting at and including the capital letter var proteinToMotifOffset = bioPolymerOneBasedIndex - motifStartLocation - 1; var indexUp = 0; - while (indexUp < motif.ToString().Length) + while (indexUp < motif.Length) { if (indexUp + proteinToMotifOffset < 0 || indexUp + proteinToMotifOffset >= sequence.Length - || !MotifMatches(motif.ToString()[indexUp], sequence[indexUp + proteinToMotifOffset])) + || !MotifMatches(motif[indexUp], sequence[indexUp + proteinToMotifOffset])) { return false; } @@ -22,18 +31,17 @@ public static bool ModFits(Modification attemptToLocalize, string sequence, int } switch (attemptToLocalize.LocationRestriction) { + // Only the intact (undigested) terminus case "N-terminal." when bioPolymerOneBasedIndex > 2: - case "Peptide N-terminal." when digestionProductOneBasedIndex > 1: - case "C-terminal." when bioPolymerOneBasedIndex < sequence.Length: - case "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength: case "5'-terminal." when bioPolymerOneBasedIndex > 2: - // first residue in oligo but not first in nucleic acid - case "Oligo 5'-terminal." when digestionProductOneBasedIndex > 1 - || bioPolymerOneBasedIndex == 1: + case "C-terminal." when bioPolymerOneBasedIndex < sequence.Length: case "3'-terminal." when bioPolymerOneBasedIndex < sequence.Length: - // not the last residue in oligo but not in nucleic acid - case "Oligo 3'-terminal." when digestionProductOneBasedIndex < digestionProductLength - || bioPolymerOneBasedIndex == sequence.Length: + + // All Digested Termini AND original undigested termini + case "Peptide N-terminal." 
when digestionProductOneBasedIndex > 1: + case "Oligo 5'-terminal." when digestionProductOneBasedIndex > 1: + case "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength: + case "Oligo 3'-terminal." when digestionProductOneBasedIndex < digestionProductLength: return false; default: @@ -56,11 +64,14 @@ public static bool UniprotModExists(IBioPolymer bioPolymer, int i, Modification private static bool MotifMatches(char motifChar, char sequenceChar) { char upperMotifChar = char.ToUpper(motifChar); - return upperMotifChar.Equals('X') - || upperMotifChar.Equals(sequenceChar) - || upperMotifChar.Equals('B') && new[] { 'D', 'N' }.Contains(sequenceChar) - || upperMotifChar.Equals('J') && new[] { 'I', 'L' }.Contains(sequenceChar) - || upperMotifChar.Equals('Z') && new[] { 'E', 'Q' }.Contains(sequenceChar); + return upperMotifChar switch + { + 'X' => true, + 'B' => sequenceChar is 'D' or 'N', + 'J' => sequenceChar is 'I' or 'L', + 'Z' => sequenceChar is 'E' or 'Q', + _ => upperMotifChar == sequenceChar + }; } } } \ No newline at end of file diff --git a/mzLib/Omics/Modifications/ModificationMotif.cs b/mzLib/Omics/Modifications/ModificationMotif.cs index 4e0833dda..0e77c087d 100644 --- a/mzLib/Omics/Modifications/ModificationMotif.cs +++ b/mzLib/Omics/Modifications/ModificationMotif.cs @@ -28,19 +28,6 @@ public static bool TryGetMotif(string motifString, out ModificationMotif motif) } return false; } - // Commented out by AVC on 4/5/23. Methods were unused and untested - // since 2017. 
- // public override bool Equals(object o) - // { - // ModificationMotif m = o as ModificationMotif; - // return m != null - // && m.motifString == motifString; - // } - // - // public override int GetHashCode() - // { - // return motifString.GetHashCode(); - // } public override string ToString() { diff --git a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs index a96be9e0c..f81700b59 100644 --- a/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs +++ b/mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs @@ -4,6 +4,7 @@ using System.Text.RegularExpressions; using Chemistry; using Omics.Fragmentation.Peptide; +using MzLibUtil; namespace Omics.SpectrumMatch { @@ -92,53 +93,15 @@ public static string RemoveParentheses(string baseSequence) } /// - /// Parses the full sequence to identify mods + /// Parses the full sequence to identify mods. /// - /// Full sequence of the peptide in question + /// Full sequence of the peptide in question + /// If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing). + /// If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid. /// Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod - public static Dictionary> ParseModifications(string fullSeq) + public static Dictionary> ParseModifications(string fullSeq, bool modOnNTerminus=true, bool modOnCTerminus=true) { - // use a regex to get all modifications - string pattern = @"\[(.+?)\]"; - Regex regex = new(pattern); - - // remove each match after adding to the dict. Otherwise, getting positions - // of the modifications will be rather difficult. 
- //int patternMatches = regex.Matches(fullSeq).Count; - Dictionary> modDict = new(); - - RemoveSpecialCharacters(ref fullSeq); - MatchCollection matches = regex.Matches(fullSeq); - int currentPosition = 0; - foreach (Match match in matches) - { - GroupCollection group = match.Groups; - string val = group[1].Value; - int startIndex = group[0].Index; - int captureLength = group[0].Length; - int position = group["(.+?)"].Index; - - List modList = new List(); - modList.Add(val); - // check to see if key already exist - // if there is a missed cleavage, then there will be a label on K and a Label on X modification. - // And, it'll be like [label]|[label] which complicates the positional stuff a little bit. - // if the already key exists, update the current position with the capture length + 1. - // otherwise, add the modification to the dict. - - // int to add is startIndex - current position - int positionToAddToDict = startIndex - currentPosition; - if (modDict.ContainsKey(positionToAddToDict)) - { - modDict[positionToAddToDict].Add(val); - } - else - { - modDict.Add(positionToAddToDict, modList); - } - currentPosition += startIndex + captureLength; - } - return modDict; + return fullSeq.ParseModifications(modOnNTerminus, modOnCTerminus); } /// @@ -150,9 +113,7 @@ public static Dictionary> ParseModifications(string fullSeq) /// public static void RemoveSpecialCharacters(ref string fullSeq, string replacement = @"", string specialCharacter = @"\|") { - // next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K) - Regex regexSpecialChar = new(specialCharacter); - fullSeq = regexSpecialChar.Replace(fullSeq, replacement); + MzLibUtil.ClassExtensions.RemoveSpecialCharacters(ref fullSeq, replacement, specialCharacter); } diff --git a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs index 1d7f1b231..1abb40e99 100644 --- a/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs 
+++ b/mzLib/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs @@ -1103,7 +1103,7 @@ private void ParseSequence(string sequence) { modification = new OldSchoolChemicalFormulaModification(ChemicalFormula.ParseFormula(modString)); } - catch (MzLibException) + catch (MzLibException e) { if (double.TryParse(modString, out double mass)) { @@ -1111,7 +1111,7 @@ private void ParseSequence(string sequence) } else { - throw new MzLibException("Unable to correctly parse the following modification: " + modString); + throw new MzLibException("Unable to correctly parse the following modification: " + modString, e); } } diff --git a/mzLib/Proteomics/PSM/PsmFromTsv.cs b/mzLib/Proteomics/PSM/PsmFromTsv.cs index 95605ab49..5837c745d 100644 --- a/mzLib/Proteomics/PSM/PsmFromTsv.cs +++ b/mzLib/Proteomics/PSM/PsmFromTsv.cs @@ -259,6 +259,5 @@ public PsmFromTsv(PsmFromTsv psm, string fullSequence, int index = 0, string bas LocalizedGlycan = psm.LocalizedGlycan; } - } } diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index fc07460d2..0053a20d7 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -12,7 +12,7 @@ namespace Proteomics { - public class Protein : IBioPolymer + public class Protein : IBioPolymer, IEquatable { private List _proteolysisProducts; @@ -969,10 +969,18 @@ public int CompareTo(Protein other) //not sure if we require any additional fields for equality public override bool Equals(object obj) { - Protein otherProtein = (Protein)obj; - return otherProtein != null && otherProtein.Accession.Equals(Accession) && otherProtein.BaseSequence.Equals(BaseSequence); + if (obj is Protein bioPol) + { + return Equals(bioPol); + } + return false; } + public bool Equals(Protein other) + { + return (this as IBioPolymer).Equals(other); + } + /// /// The protein object uses the default hash code method for speed, /// but note that two protein objects with the same information will give two different hash codes. 
diff --git a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs index 1b7d32d61..d3ca8dc24 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/PeptideWithSetModifications.cs @@ -14,9 +14,9 @@ namespace Proteomics.ProteolyticDigestion { [Serializable] - public class PeptideWithSetModifications : ProteolyticPeptide, IBioPolymerWithSetMods + public class PeptideWithSetModifications : ProteolyticPeptide, IBioPolymerWithSetMods, IEquatable { - public string FullSequence { get; private set; } //sequence with modifications + public string FullSequence { get; init; } //sequence with modifications public int NumFixedMods { get; } // Parameter to store the full sequence of the corresponding Target or Decoy peptide // If the peptide in question is a decoy, this pairs it to the target it was generated from @@ -69,7 +69,7 @@ public PeptideWithSetModifications(string sequence, Dictionary(AllModsOneIsNterminus); double massOfExistingMod = 0; - if (dictWithLocalizedMass.TryGetValue(j + 2, out Modification modToReplace)) + if (dictWithLocalizedMass.TryGetValue(indexOfMass + 2, out Modification modToReplace)) { massOfExistingMod = (double)modToReplace.MonoisotopicMass; - dictWithLocalizedMass.Remove(j + 2); + dictWithLocalizedMass.Remove(indexOfMass + 2); } - dictWithLocalizedMass.Add(j + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); + dictWithLocalizedMass.Add(indexOfMass + 2, new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod)); var peptideWithLocalizedMass = new PeptideWithSetModifications(Protein, _digestionParams, OneBasedStartResidueInProtein, OneBasedEndResidueInProtein, CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, dictWithLocalizedMass, NumFixedMods); @@ -884,33 +884,69 
@@ public override string ToString() return FullSequence + string.Join("\t", AllModsOneIsNterminus.Select(m => m.ToString())); } + #region IEquatable + + /// + /// Peptides are equal if they have the same full sequence, parent, and digestion agent + /// public override bool Equals(object obj) { - var q = obj as PeptideWithSetModifications; - - if (Protein == null && q.Protein == null) + if (obj is PeptideWithSetModifications peptide) { - return q.FullSequence.Equals(this.FullSequence); + return Equals(peptide); } + return false; + } - return q != null - && q.FullSequence.Equals(this.FullSequence) - && q.OneBasedStartResidue == this.OneBasedStartResidue - && (q.Protein.Accession == null && this.Protein.Accession == null || q.Protein.Accession.Equals(this.Protein.Accession)) - && q.DigestionParams.DigestionAgent.Equals(this.DigestionParams.DigestionAgent); + /// + /// Peptides are equal if they have the same full sequence, parent, and digestion agent + /// + public bool Equals(IBioPolymerWithSetMods other) => Equals(other as PeptideWithSetModifications); + + /// + /// Peptides are equal if they have the same full sequence, parent, and digestion agent + /// + public bool Equals(PeptideWithSetModifications other) + { + if (other is null) return false; + if (ReferenceEquals(this, other)) return true; + if (other.GetType() != GetType()) return false; + + // for those constructed from sequence and mods only + if (Parent is null && other.Parent is null) + return FullSequence.Equals(other.FullSequence); + + return FullSequence == other.FullSequence + && Equals(DigestionParams?.DigestionAgent, other.DigestionParams?.DigestionAgent) + // These last two are important for parsimony in MetaMorpheus + && OneBasedStartResidue == other!.OneBasedStartResidue + && Equals(Parent?.Accession, other.Parent?.Accession); } public override int GetHashCode() { - return FullSequence.GetHashCode(); + var hash = new HashCode(); + hash.Add(FullSequence); + hash.Add(OneBasedStartResidue); + if 
(Parent?.Accession != null) + { + hash.Add(Parent.Accession); + } + if (DigestionParams?.DigestionAgent != null) + { + hash.Add(DigestionParams.DigestionAgent); + } + return hash.ToHashCode(); } + #endregion + /// /// This should be run after deserialization of a PeptideWithSetModifications, in order to set its Protein and Modification objects, which were not serialized /// public void SetNonSerializedPeptideInfo(Dictionary idToMod, Dictionary accessionToProtein, DigestionParams dp) { - GetModsAfterDeserialization(idToMod); + _allModsOneIsNterminus = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(FullSequence, idToMod); GetProteinAfterDeserialization(accessionToProtein); _digestionParams = dp; } @@ -919,66 +955,6 @@ public void SetNonSerializedPeptideInfo(Dictionary idToMod Dictionary accessionToProtein, IDigestionParams dp) => SetNonSerializedPeptideInfo(idToMod, accessionToProtein, (DigestionParams)dp); - private void GetModsAfterDeserialization(Dictionary idToMod) - { - _allModsOneIsNterminus = new Dictionary(); - int currentModStart = 0; - int currentModificationLocation = 1; - bool currentlyReadingMod = false; - int bracketCount = 0; - - for (int r = 0; r < FullSequence.Length; r++) - { - char c = FullSequence[r]; - if (c == '[') - { - currentlyReadingMod = true; - if (bracketCount == 0) - { - currentModStart = r + 1; - } - bracketCount++; - } - else if (c == ']') - { - string modId = null; - bracketCount--; - if (bracketCount == 0) - { - try - { - //remove the beginning section (e.g. 
"Fixed", "Variable", "Uniprot") - string modString = FullSequence.Substring(currentModStart, r - currentModStart); - int splitIndex = modString.IndexOf(':'); - string modType = modString.Substring(0, splitIndex); - modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); - } - catch (Exception e) - { - throw new MzLibUtil.MzLibException( - "Error while trying to parse string into peptide: " + e.Message); - } - if (!idToMod.TryGetValue(modId, out Modification mod)) - { - throw new MzLibUtil.MzLibException( - "Could not find modification while reading string: " + FullSequence); - } - if (mod.LocationRestriction.Contains("C-terminal.") && r == FullSequence.Length - 1) - { - currentModificationLocation = BaseSequence.Length + 2; - } - _allModsOneIsNterminus.Add(currentModificationLocation, mod); - currentlyReadingMod = false; - } - } - else if (!currentlyReadingMod) - { - currentModificationLocation++; - } - //else do nothing - } - } - private void GetProteinAfterDeserialization(Dictionary idToProtein) { Protein protein = null; diff --git a/mzLib/Proteomics/ProteolyticDigestion/Protease.cs b/mzLib/Proteomics/ProteolyticDigestion/Protease.cs index 5bca90400..9ad375330 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/Protease.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/Protease.cs @@ -24,17 +24,6 @@ public override string ToString() return Name; } - public override bool Equals(object obj) - { - return obj is Protease a - && (a.Name == null && Name == null || a.Name.Equals(Name)); - } - - public override int GetHashCode() - { - return (Name ?? 
"").GetHashCode(); - } - /// /// This method is used to determine cleavage specificity if the cleavage specificity is unknown /// This occurs in the speedy nonspecific/semispecific searches when digesting post-search @@ -87,69 +76,27 @@ public CleavageSpecificity GetCleavageSpecificity(Protein protein, int startInde /// /// /// - internal List GetUnmodifiedPeptides(Protein protein, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, + internal IEnumerable GetUnmodifiedPeptides(Protein protein, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, int minPeptideLength, int maxPeptideLength, Protease specificProtease, bool topDownTruncationSearch = false) { - List peptides = new List(); - - // proteolytic cleavage in one spot (N) - if (CleavageSpecificity == CleavageSpecificity.SingleN) - { - peptides = SingleN_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); - } - - // proteolytic cleavage in one spot (C) - else if (CleavageSpecificity == CleavageSpecificity.SingleC) - { - peptides = SingleC_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); - } - - //top-down - else if (CleavageSpecificity == CleavageSpecificity.None) + return CleavageSpecificity switch { - if (!topDownTruncationSearch)//standard top-down - { - // retain methionine - if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave || protein[0] != 'M') - && ValidLength(protein.Length, minPeptideLength, maxPeptideLength)) - { - peptides.Add(new ProteolyticPeptide(protein, 1, protein.Length, 0, CleavageSpecificity.Full, "full")); - } + // proteolytic cleavage in one spot (N) + CleavageSpecificity.SingleN => SingleN_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease), - // cleave methionine - if 
((initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain && protein[0] == 'M') - && ValidLength(protein.Length - 1, minPeptideLength, maxPeptideLength)) - { - peptides.Add(new ProteolyticPeptide(protein, 2, protein.Length, 0, CleavageSpecificity.Full, "full:M cleaved")); - } - } + // proteolytic cleavage in one spot (C) + CleavageSpecificity.SingleC => SingleC_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease), - // Also digest using the proteolysis product start/end indices - peptides.AddRange( - protein.ProteolysisProducts - .Where(proteolysisProduct => proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue - && ValidLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, maxPeptideLength)) - .Select(proteolysisProduct => - new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, 0, CleavageSpecificity.None, proteolysisProduct.Type))); - } + //top-down + CleavageSpecificity.None => TopDownDigestion(protein, initiatorMethionineBehavior, minPeptideLength, maxPeptideLength, topDownTruncationSearch), - // Full proteolytic cleavage - else if (CleavageSpecificity == CleavageSpecificity.Full) - { - peptides.AddRange(FullDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); - } + // Full proteolytic cleavage + CleavageSpecificity.Full => FullDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength), - // Cleavage rules for semi-specific search - else if (CleavageSpecificity == CleavageSpecificity.Semi) - { - peptides.AddRange(SemiProteolyticDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); - } - else - { - throw new NotImplementedException(); - 
} - - return peptides; + // Cleavage rules for semi-specific search + CleavageSpecificity.Semi => SemiProteolyticDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength), + _ => throw new NotImplementedException() + }; } /// @@ -292,6 +239,46 @@ private IEnumerable FullDigestion(Protein protein, Initiator } } + /// + /// Gets protein intervals for top-down digestion. + /// + /// + /// + /// + /// + /// + /// + private IEnumerable TopDownDigestion(Protein protein, InitiatorMethionineBehavior initiatorMethionineBehavior, + int minPeptideLength, int maxPeptideLength, bool topDownTruncationSearch) + { + if (!topDownTruncationSearch) // standard top-down + { + // retain methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave || protein[0] != 'M') + && ValidLength(protein.Length, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, 1, protein.Length, 0, CleavageSpecificity.Full, "full"); + } + + // cleave methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain && protein[0] == 'M') + && ValidLength(protein.Length - 1, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, 2, protein.Length, 0, CleavageSpecificity.Full, "full:M cleaved"); + } + } + + // Also digest using the proteolysis product start/end indices + foreach (var proteolysisProduct in protein.ProteolysisProducts) + { + if (proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue + && ValidLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, 0, CleavageSpecificity.None, proteolysisProduct.Type); + } + } + } + /// /// Gets the protein intervals based on 
semiSpecific digestion rules /// This is the classic, slow semi-specific digestion that generates each semi-specific peptide pre-search diff --git a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs index 954ce449a..615a3618d 100644 --- a/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs +++ b/mzLib/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -1,7 +1,5 @@ using System; using System.Collections.Generic; -using System.Linq; -using System.Security.Cryptography; using Omics.Digestion; using Omics.Modifications; @@ -14,7 +12,6 @@ namespace Proteomics.ProteolyticDigestion [Serializable] public class ProteolyticPeptide : DigestionProduct { - internal ProteolyticPeptide(Protein protein, int oneBasedStartResidueInProtein, int oneBasedEndResidueInProtein, int missedCleavages, CleavageSpecificity cleavageSpecificityForFdrCategory, string peptideDescription = null, string baseSequence = null) : base(protein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificityForFdrCategory, peptideDescription, baseSequence) { @@ -51,142 +48,40 @@ public string PeptideDescription /// /// /// - internal IEnumerable GetModifiedPeptides(IEnumerable allKnownFixedModifications, + internal IEnumerable GetModifiedPeptides(List allKnownFixedModifications, DigestionParams digestionParams, List variableModifications) { + int variable_modification_isoforms = 0; int peptideLength = OneBasedEndResidue - OneBasedStartResidue + 1; int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; int maxModsForPeptide = digestionParams.MaxModsForPeptide; - var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(peptideLength + 4); - - var pepNTermVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(1, pepNTermVariableMods); + var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get(); 
+ var fixedModDictionary = FixedModDictionaryPool.Get(); - var pepCTermVariableMods = new List(); - twoBasedPossibleVariableAndLocalizeableModifications.Add(peptideLength + 2, pepCTermVariableMods); - - foreach (Modification variableModification in variableModifications) + try { - // Check if can be a n-term mod - if (CanBeNTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, 1, variableModification)) - { - pepNTermVariableMods.Add(variableModification); - } + PopulateVariableModifications(variableModifications, in twoBasedPossibleVariableAndLocalizeableModifications); + PopulateFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications, in fixedModDictionary); - for (int r = 0; r < peptideLength; r++) - { - if (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidue + r) - && variableModification.LocationRestriction == "Anywhere." && !ModificationLocalization.UniprotModExists(Protein, r + 1, variableModification)) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - } - // Check if can be a c-term mod - if (CanBeCTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, peptideLength, variableModification)) + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) { - pepCTermVariableMods.Add(variableModification); - } - } + AppendFixedModificationsToVariable(in fixedModDictionary, in variableModPattern, out int numFixedMods); - // LOCALIZED MODS - foreach (var kvp in 
Protein.OneBasedPossibleLocalizedModifications) - { - bool inBounds = kvp.Key >= OneBasedStartResidue && kvp.Key <= OneBasedEndResidue; - if (!inBounds) - { - continue; - } + yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidue, OneBasedEndResidue, + CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, variableModPattern, numFixedMods); - int locInPeptide = kvp.Key - OneBasedStartResidueInProtein + 1; - foreach (Modification modWithMass in kvp.Value) - { - if (modWithMass is Modification variableModification) + variable_modification_isoforms++; + if (variable_modification_isoforms == maximumVariableModificationIsoforms) { - // Check if can be a n-term mod - if (locInPeptide == 1 && CanBeNTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) - { - pepNTermVariableMods.Add(variableModification); - } - - int r = locInPeptide - 1; - if (r >= 0 && r < peptideLength - && (Protein.IsDecoy || - (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidueInProtein + r) - && variableModification.LocationRestriction == "Anywhere."))) - { - if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) - { - residueVariableMods = new List { variableModification }; - twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); - } - else - { - residueVariableMods.Add(variableModification); - } - } - - // Check if can be a c-term mod - if (locInPeptide == peptideLength && CanBeCTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) - { - pepCTermVariableMods.Add(variableModification); - } + yield break; } } } - - int variable_modification_isoforms = 0; - - foreach (Dictionary kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) + finally { - int numFixedMods = 0; - foreach (var ok in 
GetFixedModsOneIsNorFivePrimeTerminus(peptideLength, allKnownFixedModifications)) - { - if (!kvp.ContainsKey(ok.Key)) - { - numFixedMods++; - kvp.Add(ok.Key, ok.Value); - } - } - yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidue, OneBasedEndResidue, - CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, kvp, numFixedMods); - variable_modification_isoforms++; - if (variable_modification_isoforms == maximumVariableModificationIsoforms) - { - yield break; - } + FixedModDictionaryPool.Return(fixedModDictionary); + DictionaryPool.Return(twoBasedPossibleVariableAndLocalizeableModifications); } } - - /// - /// Determines whether given modification can be an N-terminal modification - /// - /// - /// - /// - private bool CanBeNTerminalMod(Modification variableModification, int peptideLength) - { - return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, 1, peptideLength, OneBasedStartResidue) - && (variableModification.LocationRestriction == "N-terminal." || variableModification.LocationRestriction == "Peptide N-terminal."); - } - - /// - /// Determines whether given modification can be a C-terminal modification - /// - /// - /// - /// - private bool CanBeCTerminalMod(Modification variableModification, int peptideLength) - { - return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, peptideLength, peptideLength, OneBasedStartResidue + peptideLength - 1) - && (variableModification.LocationRestriction == "C-terminal." 
|| variableModification.LocationRestriction == "Peptide C-terminal."); - } } } \ No newline at end of file diff --git a/mzLib/Readers/Bruker/BrukerFileReader.cs b/mzLib/Readers/Bruker/BrukerFileReader.cs index 0c80151e9..ec943a691 100644 --- a/mzLib/Readers/Bruker/BrukerFileReader.cs +++ b/mzLib/Readers/Bruker/BrukerFileReader.cs @@ -378,7 +378,7 @@ private List GetFullStepsTable() /// SQLiteReader object, initialized after the execution of a command. /// Return null exception if there is an error in the data format of the baf file. /// - private T SqlColumnReader(SQLiteDataReader reader) where T: new() + public static T SqlColumnReader(SQLiteDataReader reader) where T: new() { // get all the property names, then iterate over that. // The objects should be exact 1:1 column corresponding so as @@ -516,7 +516,7 @@ private static void ThrowLastBaf2SqlError() } /* ----------------------------------------------------------------------------------------------- */ - private static byte[] ConvertStringToUTF8ByteArray(String input) + public static byte[] ConvertStringToUTF8ByteArray(String input) { byte[] utf8 = Encoding.UTF8.GetBytes(input); var result = new byte[utf8.Length + 1]; diff --git a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs index 9e1a1a54e..d181e399e 100644 --- a/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs +++ b/mzLib/Readers/ExternalResults/BaseClasses/IQuantifiableResultFile.cs @@ -24,6 +24,6 @@ public interface IQuantifiableResultFile : IResultFile /// /// list of file paths associated with each distinct record /// Dictionary of file names and their associted full paths - public Dictionary FileNametoFilePath(List fullFilePath); + public Dictionary FileNameToFilePath(List fullFilePath); } } \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/ExperimentAnnotation.cs 
b/mzLib/Readers/ExternalResults/IndividualResultRecords/ExperimentAnnotation.cs new file mode 100644 index 000000000..93b1192b8 --- /dev/null +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/ExperimentAnnotation.cs @@ -0,0 +1,44 @@ +using CsvHelper.Configuration; +using CsvHelper.Configuration.Attributes; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers +{ + /// + /// A class representing a single entry in an experiment_annotation.tsv file + /// + public class ExperimentAnnotation + { + public static CsvConfiguration CsvConfiguration = new CsvConfiguration(CultureInfo.InvariantCulture) + { + Delimiter = "\t", + HasHeaderRecord = true, + IgnoreBlankLines = true, + TrimOptions = TrimOptions.Trim + }; + + #region experiment_annotation Fields + + [Name("file")] + public string File { get; set; } + + [Name("sample")] + public string Sample { get; set; } + + [Name("sample_name")] + public string SampleName { get; set; } + + [Name("condition")] + public string Condition { get; set; } + + [Name("replicate")] + public string Replicate { get; set; } + + #endregion + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/ResultFiles/ExperimentAnnotationFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/ExperimentAnnotationFile.cs new file mode 100644 index 000000000..9a975b84d --- /dev/null +++ b/mzLib/Readers/ExternalResults/ResultFiles/ExperimentAnnotationFile.cs @@ -0,0 +1,54 @@ +using CsvHelper; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers +{ + /// + /// Concrete Product for reading and representing a experiment annotation file + /// + public class ExperimentAnnotationFile: ResultFile, IResultFile + { + public override SupportedFileType FileType => SupportedFileType.ExperimentAnnotation; + + public override Software Software 
{ get; set; } + + public ExperimentAnnotationFile(string filePath) : base(filePath, Software.MsFragger) { } + + /// + /// Constructor used to initialize from the factory method + /// + public ExperimentAnnotationFile() : base() { } + + /// + /// Load Results to the Results List from the given filepath + /// + public override void LoadResults() + { + using var csv = new CsvReader(new StreamReader(FilePath), ExperimentAnnotation.CsvConfiguration); + Results = csv.GetRecords().ToList(); + } + + /// + /// Writes results to a specific output path + /// + /// destination path + public override void WriteResults(string outputPath) + { + if (!CanRead(outputPath)) + outputPath += FileType.GetFileExtension(); + + using var csv = new CsvWriter(new StreamWriter(File.Create(outputPath)), ExperimentAnnotation.CsvConfiguration); + + csv.WriteHeader(); + foreach (var result in Results) + { + csv.NextRecord(); + csv.WriteRecord(result); + } + } + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerCombinedResults.cs b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerCombinedResults.cs new file mode 100644 index 000000000..db00ed7b0 --- /dev/null +++ b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerCombinedResults.cs @@ -0,0 +1,171 @@ +using CsvHelper; +using Readers.ExternalResults.BaseClasses; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using System.IO; +using MathNet.Numerics; + +namespace Readers +{ + public class MsFraggerCombinedResults : ResultFile, IResultFile, IQuantifiableResultFile + { + #region Properties/Fields + + public string FullFolderPath => FilePath; // The full file path to the folder of MSFragger results + private List allPsmFilePaths; // List of the full file paths to the psm files of every sample + + // A list of all the MSFraggerPsmFile objects that correspond to each sample within an experiment + public List AllPsmFiles { get; 
private set; } + + // Contains descriptive information on every ms data file in the experiment (sample name, full path to the ms data file, etc.) + public ExperimentAnnotationFile ExperimentAnnotations { get; private set; } + + #endregion + + #region IResultFile Implementatation + + public override SupportedFileType FileType => SupportedFileType.MsFraggerPsm; + public override Software Software { get; set; } + public MsFraggerCombinedResults(string filePath) : base(filePath, Software.MsFragger) { } + + /// + /// Loads the results from each psm.tsv file in the results folder, builds one list of MsFraggerPsms, + /// and calls LoadExperimentAnnotationResults, FindAllFilePaths, LoadPsmResults, + /// then selects every result from each MsFraggerPsmFile in AllPsmFiles and writes them to one concatenated list. + /// + public override void LoadResults() + { + LoadExperimentAnnotationResults(); + FindAllFilePaths(); + LoadPsmResults(); + + List concatList = new List(); + foreach (var file in AllPsmFiles) + { + concatList.AddRange(file); + } + + Results = concatList; + } + + public override void WriteResults(string outputPath) + { + throw new NotImplementedException("Method not yet implemented."); + } + + #endregion + + /// + /// Checks for existence of experiment annotation file and loads it as an ExperimentAnnotationFile, + /// then sets the ExperimentAnnotations property + /// + /// + public void LoadExperimentAnnotationResults() + { + string combinedFilePath = Path.Combine(FullFolderPath, "experiment_annotation.tsv"); + if (!File.Exists(combinedFilePath)) { throw new FileNotFoundException("The experiment_annotation.tsv file was not found"); } + + ExperimentAnnotations = new ExperimentAnnotationFile(combinedFilePath); + } + + /// + /// For each path in AllPsmFilePaths, creates and loads an MsFraggerPsmFile. 
+ /// Then constructs the AllPsmFiles list + /// + public void LoadPsmResults() + { + AllPsmFiles = new List(); + + foreach(var path in allPsmFilePaths) + { + MsFraggerPsmFile file = new MsFraggerPsmFile(path); + AllPsmFiles.Add(file); + } + } + + public IEnumerable GetQuantifiableResults() => Results; + + /// + /// Links the file name associated with an IQuantifiableRecord + /// to the raw file path of MassSpec data in the filePaths list + /// + /// list of file paths associated with each distinct record + /// Dictionary of file names and their associated full paths + public Dictionary FileNameToFilePath(List filePaths) + { + Dictionary allFiles = new Dictionary(); + + allFiles = AllPsmFiles.Select(file => file.FileNameToFilePath(filePaths)) + .SelectMany(dictionary => dictionary) + .GroupBy(x => x.Key) + .Select(keyValuePair => keyValuePair.First()) + .ToDictionary(fileName => fileName.Key, filePath => filePath.Value); + + return allFiles; + } + + /// + /// Links the file name associated with IQuantifiableRecord to the raw file path of the MassSpec file + /// using the full file paths from the experiment annotation file. 
+ /// + /// Dictionary of file names and their associated full paths + public Dictionary FileNameToFilePath() + { + List filePaths = ExperimentAnnotations.Select(psm => psm.File).Distinct().ToList(); + List fileNames = Results.Select(psm => psm.FileName).Distinct().ToList(); + Dictionary allFiles = new Dictionary(); + + foreach (var name in fileNames) + { + string fileName = Path.GetFileName(name); + + // MSFragger results append the raw file with "interact-" and replace .raw with .pep.xml + // In order to correctly match the file names, these changes must be removed + fileName = fileName.Replace("interact-", "").Replace(".pep.xml", ""); + + foreach (var path in filePaths) + { + if (path.Contains(fileName) && !allFiles.ContainsKey(name)) + { + allFiles.Add(name, path); + break; + } + } + } + + return allFiles; + } + + /// + /// Uses the ExperimentAnnotations to locate each psm.tsv file in the results folder. + /// Adds the path to each psm.tsv file in the results folder to AllPsmFilePaths + /// + /// + private void FindAllFilePaths() + { + allPsmFilePaths = new List(); + + List sampleNames = ExperimentAnnotations.Select(psm => psm.SampleName).Distinct().ToList(); + string[] directoryEntries = Directory.GetDirectories(FullFolderPath); + + foreach (var directoryEntry in directoryEntries) + { + string directoryName = Path.GetFileName(directoryEntry.TrimEnd(Path.DirectorySeparatorChar)); + + foreach (var sample in sampleNames) + { + if (directoryName.Equals(sample)) + { + string psmFile = Path.Combine(directoryEntry, "psm.tsv"); + if (!File.Exists(psmFile)) { throw new FileNotFoundException("This psm.tsv file was not found"); } + + allPsmFilePaths.Add(psmFile); + } + } + } + } + } +} \ No newline at end of file diff --git a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs index 1aa80f885..f15a2d909 100644 --- a/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs +++ 
b/mzLib/Readers/ExternalResults/ResultFiles/MsFraggerPsmFile.cs @@ -48,7 +48,7 @@ public override void WriteResults(string outputPath) /// /// list of all full file paths associted with a given result /// dictionary with key fileName and value fullFilePath - public Dictionary FileNametoFilePath (List fullFilePath) + public Dictionary FileNameToFilePath (List fullFilePath) { List rawFileNames = Results.Select(psm => psm.FileName).Distinct().ToList(); fullFilePath = fullFilePath.Distinct().ToList(); diff --git a/mzLib/Readers/MsDataFileReader.cs b/mzLib/Readers/MsDataFileReader.cs index ff577f85b..91c801235 100644 --- a/mzLib/Readers/MsDataFileReader.cs +++ b/mzLib/Readers/MsDataFileReader.cs @@ -20,6 +20,7 @@ public static MsDataFile GetDataFile(string filePath) SupportedFileType.MzML => new Mzml(filePath), SupportedFileType.Mgf => new Mgf(filePath), SupportedFileType.BrukerD => new BrukerFileReader(filePath), + SupportedFileType.BrukerTimsTof => new TimsTofFileReader(filePath), _ => throw new MzLibException("File type not supported"), }; } diff --git a/mzLib/Readers/Readers.csproj b/mzLib/Readers/Readers.csproj index 076ed9e40..ab22f2a05 100644 --- a/mzLib/Readers/Readers.csproj +++ b/mzLib/Readers/Readers.csproj @@ -1,10 +1,11 @@ - + net8.0 x64 enable enable + true @@ -12,6 +13,7 @@ + @@ -21,7 +23,6 @@ - Thermo\ThermoFisher.CommonCore.BackgroundSubtraction.dll @@ -70,6 +71,18 @@ Always + + PreserveNewest + + + PreserveNewest + + + Always + + + Always + diff --git a/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs b/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs index 62a720c63..709b391ba 100644 --- a/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs +++ b/mzLib/Readers/SearchResults/SpectrumMatchTsvReader.cs @@ -28,7 +28,7 @@ public static List ReadTsv(string filePath, out List /// Gets all the MS orders of all scans in a dynamic connection. This is useful if you want to open all MS1 scans /// without loading all of the other MSn scans. 
diff --git a/mzLib/Readers/Util/SupportedFileTypes.cs b/mzLib/Readers/Util/SupportedFileTypes.cs index e20cd60a1..a960cb794 100644 --- a/mzLib/Readers/Util/SupportedFileTypes.cs +++ b/mzLib/Readers/Util/SupportedFileTypes.cs @@ -12,7 +12,6 @@ public enum SupportedFileType ThermoRaw, MzML, Mgf, - BrukerD, psmtsv, //osmtsv ToppicPrsm, @@ -26,7 +25,10 @@ public enum SupportedFileType MsPathFinderTTargets, MsPathFinderTDecoys, MsPathFinderTAllResults, - CruxResult + CruxResult, + ExperimentAnnotation, + BrukerD, + BrukerTimsTof } public static class SupportedFileTypeExtensions @@ -50,6 +52,7 @@ public static string GetFileExtension(this SupportedFileType type) SupportedFileType.MzML => ".mzML", SupportedFileType.Mgf => ".mgf", SupportedFileType.BrukerD => ".d", + SupportedFileType.BrukerTimsTof => ".d", SupportedFileType.psmtsv => ".psmtsv", //SupportedFileType.osmtsv => ".osmtsv", SupportedFileType.ToppicPrsm => "_prsm.tsv", @@ -64,6 +67,7 @@ public static string GetFileExtension(this SupportedFileType type) SupportedFileType.MsPathFinderTDecoys => "_IcDecoy.tsv", SupportedFileType.MsPathFinderTAllResults => "_IcTDA.tsv", SupportedFileType.CruxResult => ".txt", + SupportedFileType.ExperimentAnnotation => "experiment_annotation.tsv", _ => throw new MzLibException("File type not supported") }; } @@ -74,7 +78,14 @@ public static SupportedFileType ParseFileType(this string filePath) case ".raw": return SupportedFileType.ThermoRaw; case ".mzml": return SupportedFileType.MzML; case ".mgf": return SupportedFileType.Mgf; - case ".d": return SupportedFileType.BrukerD; + case ".d": + if(!Directory.Exists(filePath)) throw new FileNotFoundException(); + var fileList = Directory.GetFiles(filePath).Select(p => Path.GetFileName(p)); + if (fileList.Any(file => file == "analysis.baf")) + return SupportedFileType.BrukerD; + if (fileList.Any(file => file == "analysis.tdf")) + return SupportedFileType.BrukerTimsTof; + throw new MzLibException("Bruker file type not recognized"); case 
".psmtsv": return SupportedFileType.psmtsv; //case ".osmtsv": return SupportedFileType.osmtsv; case ".feature": @@ -116,6 +127,8 @@ public static SupportedFileType ParseFileType(this string filePath) return SupportedFileType.MsPathFinderTDecoys; if (filePath.EndsWith(SupportedFileType.MsPathFinderTAllResults.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) return SupportedFileType.MsPathFinderTAllResults; + if(filePath.EndsWith(SupportedFileType.ExperimentAnnotation.GetFileExtension(), StringComparison.InvariantCultureIgnoreCase)) + return SupportedFileType.ExperimentAnnotation; // these tsv cases are just .tsv and need an extra step to determine the type // currently need to distinguish between FlashDeconvTsv and MsFraggerPsm diff --git a/mzLib/Readers/timsTOF/FrameProxy.cs b/mzLib/Readers/timsTOF/FrameProxy.cs new file mode 100644 index 000000000..00568e3c7 --- /dev/null +++ b/mzLib/Readers/timsTOF/FrameProxy.cs @@ -0,0 +1,334 @@ +using MassSpectrometry; +using System.Runtime.InteropServices; + +namespace Readers +{ + /// + /// Factory class for creating FrameProxy instances and managing frame-related data. 
+ /// + internal class FrameProxyFactory + { + internal FrameTable FramesTable { get; } + internal UInt64 FileHandle { get; } + internal Object FileLock { get; } + internal TimsConversion Converter { get; } + public int MaxIndex { get; init; } + /// + /// Used to convert the tofIndices stored in the .d file to m/z values + /// + public double[] MzLookupArray { get; set; } + /// + /// Used to convert scan number to 1/K0 values + /// + public double[] OneOverK0LookupArray { get; set; } + + internal FrameProxyFactory(FrameTable table, UInt64 fileHandle, Object fileLock, int maxIndex) + { + FramesTable = table; + FileHandle = fileHandle; + FileLock = fileLock; + Converter = new TimsConversion(fileHandle, fileLock); + MaxIndex = maxIndex; + InitializeLookupTables(fileHandle); + } + + internal FrameProxy GetFrameProxy(long frameId) + { + return new FrameProxy(FileHandle, frameId, FramesTable.NumScans[frameId - 1], FileLock, Converter); + } + + internal double[] ConvertIndicesToMz(IList indices) + { + double[] mzArray = new double[indices.Count()]; + for (int idx = 0; idx < indices.Count(); idx++) + { + if (indices[idx] >= MzLookupArray.Length) + throw new ArgumentException("Index out of range"); + mzArray[idx] = MzLookupArray[indices[idx]]; + } + return mzArray; + } + + /// + /// Accesses the file, then stores the index to m/z lookup in the mzLookup array + /// and the index to 1/k0 lookup in the OneOverK0LookupArray + /// + /// + internal void InitializeLookupTables(ulong handle) + { + uint[] lArray = new uint[MaxIndex]; + for (uint i = 0; i < MaxIndex; i++) + { + lArray[i] = i; + } + + // Each frame technically has slightly different index --> m/z mapping + // but in conversations with Sander Willem, I was told that the differences are negligible + // so we can use the median frame to generate the lookup table + long medianFrameId = FramesTable.OneBasedFrameIndex[FramesTable.OneBasedFrameIndex.Length / 2]; + + // Populate the mzLookupArray + double[] mzLookupIndices = 
Array + .ConvertAll(lArray, entry => (double)entry); + MzLookupArray = Converter.DoTransformation(handle, medianFrameId, mzLookupIndices, ConversionFunctions.IndexToMz); + + // Populate the 1/K0 lookup array + int scanMax = FramesTable.NumScans.Max(); + double[] oneOverK0LookupIndices = Array + .ConvertAll(Enumerable.Range(0, scanMax).ToArray(), entry => (double)entry); + OneOverK0LookupArray = Converter.DoTransformation(handle, medianFrameId, oneOverK0LookupIndices, ConversionFunctions.ScanToOneOverK0); + } + + internal Polarity GetPolarity(long frameId) + { + return FramesTable.Polarity[frameId - 1] == '+' ? Polarity.Positive : Polarity.Negative; + } + + internal double GetOneOverK0(double medianScanNumber) + { + // The lookup array is 0-indexed, so we need to subtract 1 from the scan number + if (medianScanNumber % 1 == 0) + return OneOverK0LookupArray[(int)medianScanNumber - 1]; + else + { + int floor = (int)Math.Floor(medianScanNumber); + int ceil = (int)Math.Ceiling(medianScanNumber); + return (OneOverK0LookupArray[floor - 1] + OneOverK0LookupArray[ceil - 1]) / 2; + } + } + + internal double GetRetentionTime(long frameId) + { + return (double)FramesTable.RetentionTime[frameId - 1]; + } + + internal double GetInjectionTime(long frameId) + { + return FramesTable.FillTime[frameId - 1]; + } + + internal double GetInjectionTimeSum(long firstFrameId, long lastFrameId) + { + double injectionTimeSum = 0; + for(long i = firstFrameId; i <= lastFrameId; i++) + { + injectionTimeSum += FramesTable.FillTime[i - 1]; + } + return injectionTimeSum; + } + } + + /// + /// Proxy class for accessing frame data. Each FrameProxy stores the raw information collected across all + /// ~1000 scans that make up a frame + /// + internal class FrameProxy + { + private int[] _scanOffsets; // Number of peaks that precede a given scan in a frame + /// + /// This is one huge array that stores ALLLL the information for the frame. 
+ /// Specific scans are accessed by determining the number of data points that were collected + /// before the scan took place, then jumping forward by that amount to get the data for that scan + /// + public uint[] _rawData; + /// + /// default size for the raw data array + /// + private const int _defaultBufferSize = 4096; + internal UInt64 FileHandle { get; } + internal long FrameId { get; } + internal int NumberOfScans { get; } + internal TimsConversion Converter { get; } + + internal FrameProxy(UInt64 fileHandle, long frameId, int numScans, Object fileLock, TimsConversion converter) + { + NumberOfScans = numScans; + FileHandle = fileHandle; + FrameId = frameId; + Converter = converter; + + _rawData = GetScanRawData(fileHandle, frameId, (uint)numScans, fileLock); + _scanOffsets = PartialSum(_rawData, 0, numScans); + } + + /// + /// Gets the intensities for the specified scan. + /// + /// Zero-indexed scan number. + /// Array of intensities. + internal int[] GetScanIntensities(int zeroIndexedScanNumber) + { + return Array.ConvertAll(_rawData[GetYRange(zeroIndexedScanNumber)], entry => (int)entry); + } + + /// + /// Gets the indices for the specified scan. + /// + /// Zero-indexed scan number. + /// Array of indices. + internal uint[] GetScanIndices(int zeroIndexedScanNumber) + { + return _rawData[GetXRange(zeroIndexedScanNumber)]; + } + + /// + /// Read a range of scans from a single frame. + /// + /// Output layout: (N = scan_end - scan_begin = number of requested scans) + /// N x uint32_t: number of peaks in each of the N requested scans + /// N x (two uint32_t arrays: first indices, then intensities) + /// + /// Note: different threads must not read scans from the same storage handle + /// concurrently. 
+ /// + internal static uint[] GetScanRawData(UInt64 fileHandle, long frameId, UInt32 numScans, Object fileLock) + { + int bufferSize = _defaultBufferSize; + // buffer expansion loop + while (true) + { + IntPtr pData = Marshal.AllocHGlobal(bufferSize * Marshal.SizeOf()); + try + { + uint outputLength; + + lock (fileLock) + { + outputLength = tims_read_scans_v2( + fileHandle, + frameId, + scan_begin: 0, + scan_end: numScans, + buffer: pData, + length: (uint)(bufferSize * 4)); + } + + if (4 * bufferSize > outputLength) + { + var dataArray = new uint[bufferSize]; + CopyToManaged(pData, dataArray, 0, bufferSize); + + return dataArray; + } + + if (outputLength > 16777216) // Arbitrary 16 mb frame limit + { + throw new Exception("Maximum frame size exceeded"); + } + + // Increase buffer size if necessary + bufferSize = ((int)outputLength / 4) + 1; + } + finally{ Marshal.FreeHGlobal(pData); } + } + } + + /// + /// Returns a range containing the start(inclusive) and end (exclusive) indices + /// for the segment of the _rawData array corresponding to the m/z lookup values for + /// a given scan + /// + /// Throws exception if scan number out of range + internal Range GetXRange(int zeroIndexedScanNumber) + { + ThrowIfInvalidScanNumber(zeroIndexedScanNumber); + return GetScanRange(zeroIndexedScanNumber, offset: 0); + } + + /// + /// Returns a range containing the start(inclusive) and end (exclusive) indices + /// for the segment of the _rawData array corresponding to raw intensity values for a given scan + /// + internal Range GetYRange(int zeroIndexedScanNumber) + { + ThrowIfInvalidScanNumber(zeroIndexedScanNumber); + return GetScanRange(zeroIndexedScanNumber, offset: (int)_rawData[zeroIndexedScanNumber]); + } + + /// Throws exception if scan number out of range + private void ThrowIfInvalidScanNumber(int zeroIndexedScanNumber) + { + if (zeroIndexedScanNumber < 0 || zeroIndexedScanNumber >= NumberOfScans) + throw new ArgumentException("Scan number out of range."); + } + + 
private Range GetScanRange(int zeroIndexedScanNumber, int offset) + { + int start = NumberOfScans + 2*_scanOffsets[zeroIndexedScanNumber] + offset; + return new Range(start, start + (int)_rawData[zeroIndexedScanNumber]); + } + + /// + /// Calculates the running total of an array, beginning with + /// the start index (inclusive) and ending with the end index (exclusive). + /// Used for determining scan offsets. + /// + /// Array to be summed + /// Where to begin summing + /// Where summing ends (exclusive) + /// An array of length (end - start) containing the + /// partial sums at each index of the input array + public static int[] PartialSum(uint[] array, int start, int end) + { + int runningTotal = 0; + int[] sums = new int[end - start + 1]; + sums[0] = 0; + + for(int i = 0; i < end; i++) + { + runningTotal += (int)array[i]; + sums[i+1] = runningTotal; + } + return sums; + } + + /// + /// This is reimplementation of the Marshal.Copy method that allows for arbitrary types + /// + /// + /// + /// + /// + /// + /// + /// + internal static unsafe void CopyToManaged(IntPtr source, T[] destination, int startIndex, int length) + { + if (source == IntPtr.Zero) throw new ArgumentNullException(nameof(source)); + if (destination is null) throw new ArgumentNullException(nameof(destination)); + if (startIndex < 0) throw new ArgumentOutOfRangeException(nameof(startIndex)); + if (length < 0) throw new ArgumentOutOfRangeException(nameof(length)); + + void* sourcePtr = (void*)source; + Span srcSpan = new Span(sourcePtr, length); + Span destSpan = new Span(destination, startIndex, length); + + srcSpan.CopyTo(destSpan); + } + + + /// + /// Read a range of scans from a single frame. 
+ /// + /// Output layout: (N = scan_end - scan_begin = number of requested scans) + /// N x uint32_t: number of peaks in each of the N requested scans + /// N x (two uint32_t arrays: first indices, then intensities) + /// + /// Note: different threads must not read scans from the same storage handle + /// concurrently. + /// + /// Unique Handle of .d file ( returned on tims_open() ) + /// From .tdf SQLite: Frames.Id + /// first scan number to read (inclusive) + /// Last scan number (exclusive) + /// Destination buffer allocated by user + /// Length of the buffer (in bytes, i.e. 4 * buffer.length) + /// 0 on error, otherwise the number of buffer bytes necessary for the output + /// of this call (if this is larger than the provided buffer length, the result is not + /// complete). + [DllImport("timsdata.dll", CallingConvention = CallingConvention.Cdecl)] + unsafe static extern UInt32 tims_read_scans_v2 + (UInt64 handle, Int64 frame_id, UInt32 scan_begin, UInt32 scan_end, IntPtr buffer, UInt32 length); + + } +} diff --git a/mzLib/Readers/timsTOF/Records.cs b/mzLib/Readers/timsTOF/Records.cs new file mode 100644 index 000000000..34b779ae9 --- /dev/null +++ b/mzLib/Readers/timsTOF/Records.cs @@ -0,0 +1,68 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Readers +{ + internal readonly struct Ms1Record + { + internal int PrecursorId { get; } + internal int ScanStart { get; } + internal int ScanEnd { get; } + internal double ScanMedian { get; } + + public Ms1Record(int precursorId, int scanStart, int scanEnd, double scanMedian) + { + PrecursorId = precursorId; + ScanStart = scanStart; + ScanEnd = scanEnd; + ScanMedian = scanMedian; + } + } + + internal readonly struct PasefRecord + { + internal IEnumerable FrameList { get; } + internal int PrecursorId { get; } + internal int ScanStart { get; } + internal int ScanEnd { get; } + internal double ScanMedian { get; } + internal float 
IsolationMz { get; } + internal float IsolationWidth { get; } + internal float CollisionEnergy { get; } + internal float MostAbundantPrecursorMz { get; } + internal float PrecursorMonoisotopicMz { get; } + internal int Charge { get; } + internal float PrecursorIntensity { get; } + + public PasefRecord( + IEnumerable frameList, + int precursorId, + int scanStart, + int scanEnd, + double scanMedian, + float isolationMz, + float isolationWidth, + float collisionEnergy, + float mostAbundantPrecursorMz, + float precursorMonoisotopicMz, + int charge, + float precursorIntensity) + { + FrameList = frameList ?? throw new ArgumentNullException(nameof(frameList)); + PrecursorId = precursorId; + ScanStart = scanStart; + ScanEnd = scanEnd; + ScanMedian = scanMedian; + IsolationMz = isolationMz; + IsolationWidth = isolationWidth; + CollisionEnergy = collisionEnergy; + MostAbundantPrecursorMz = mostAbundantPrecursorMz; + PrecursorMonoisotopicMz = precursorMonoisotopicMz; + Charge = charge; + PrecursorIntensity = precursorIntensity; + } + } +} diff --git a/mzLib/Readers/timsTOF/Tables.cs b/mzLib/Readers/timsTOF/Tables.cs new file mode 100644 index 000000000..6e475be39 --- /dev/null +++ b/mzLib/Readers/timsTOF/Tables.cs @@ -0,0 +1,91 @@ +using MzLibUtil; +using System.Data.SQLite; + +namespace Readers +{ + internal enum TimsTofMsMsType + { + MS = 0, + MSMSFragment = 2, + PASEF = 8, + DIA = 9, + PRM = 10 + } + + internal enum TimsTofAcquisitionMode + { + MS = 0, + AutoMSMS = 1, + MRM = 2, + inSourceCID = 3, + broadbandCID = 4, + PASEF = 8, + DIA = 9, + PRM = 10, + Maldi = 20 + } + + /// + /// This class stores information taken from the .tdf SQLite database file + /// Every frame in the file has 9 pieces of metadata that can be accessed by + /// selecting the appropriate array. All arrays are zero-based!!! 
+ /// EX: ScanMode[0] will return the scan mode of the first frame (FrameID = 1) in the file + /// + internal class FrameTable + { + internal long[] OneBasedFrameIndex { get; } + internal char[] Polarity { get; } + internal int[] NumScans { get; } + internal int[] ScanMode { get; } + internal int[] MsMsType { get; } + internal int[] TotalNumberOfPeaks { get; } + internal int[] TotalIntensity { get; } + internal float[] RetentionTime { get; } + internal float[] FillTime { get; } + + internal TimsTofMsMsType GetAnalysisType(int frameId) + { + if (frameId == 0 || frameId > MsMsType.Length) throw new IndexOutOfRangeException("Invalid frame ID!"); + if (MsMsType[frameId - 1].ToEnum(out var analysisType)) + return analysisType; + else + throw new MzLibException("Unrecognized MS/MS method."); + } + + internal FrameTable(SQLiteConnection connection, int numberOfRows) + { + using var command = new SQLiteCommand(connection); + command.CommandText = @"SELECT f.Id, f.Polarity, f.NumScans," + + " f.ScanMode, f.MsMsType, f.NumPeaks, f.SummedIntensities," + + " f.Time, f.AccumulationTime FROM Frames f;"; + using var reader = command.ExecuteReader(); + + OneBasedFrameIndex = new long[numberOfRows]; + Polarity = new char[numberOfRows]; + NumScans = new int[numberOfRows]; + ScanMode = new int[numberOfRows]; + MsMsType = new int[numberOfRows]; + TotalNumberOfPeaks = new int[numberOfRows]; + TotalIntensity = new int[numberOfRows]; + RetentionTime = new float[numberOfRows]; + FillTime = new float[numberOfRows]; + + // Populate arrays by reading in the table + for (int i = 0; i < numberOfRows; i++) + { + if (!reader.Read()) break; + OneBasedFrameIndex[i] = reader.GetInt64(0); + Polarity[i] = reader.GetString(1)[0]; + NumScans[i] = reader.GetInt32(2); + ScanMode[i] = reader.GetInt32(3); + MsMsType[i] = reader.GetInt32(4); + TotalNumberOfPeaks[i] = reader.GetInt32(5); + TotalIntensity[i] = reader.GetInt32(6); + RetentionTime[i] = reader.GetFloat(7); + FillTime[i] = reader.GetFloat(8); + } 
+ + } + + } +} diff --git a/mzLib/Readers/timsTOF/TimsConversion.cs b/mzLib/Readers/timsTOF/TimsConversion.cs new file mode 100644 index 000000000..b4b766fc6 --- /dev/null +++ b/mzLib/Readers/timsTOF/TimsConversion.cs @@ -0,0 +1,108 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Reflection.Metadata; +using System.Runtime.InteropServices; +using System.Text; +using System.Threading.Tasks; + +namespace Readers +{ + + internal enum ConversionFunctions + { + IndexToMz, + MzToIndex, + ScanToOneOverK0, + OneOverK0ToScan, + ScanToVoltage, + VoltageToScan + } + + internal unsafe class TimsConversion + { + + private UInt64 _fileHandle; + private Object _fileLock; + + internal TimsConversion(UInt64 fileHandle, Object fileLock) + { + _fileHandle = fileHandle; + _fileLock = fileLock; + } + + /// + /// Takes an array of raw values and converts them according to the specified conversion function, + /// returning an equal length array containing the transformed values + /// + /// Unique identifier associated with the open timsTof .d data file + /// Frame identified + /// Double array containing the transformed input values + internal unsafe double[] DoTransformation(UInt64 fileHandle, long frameId, double[] input, ConversionFunctions function) + { + if(!input.IsNotNullOrEmpty()) + { + return Array.Empty(); + } + double[] transformedValues = new double[input.Length]; + fixed (double* inputPtr = &input[0]) + { + IntPtr outPtr = Marshal.AllocHGlobal(input.Length * Marshal.SizeOf()); + try + { + lock (_fileLock) + { + switch (function) + { + case ConversionFunctions.IndexToMz: + tims_index_to_mz(fileHandle, frameId, inputPtr, (double*)outPtr, (UInt32)input.Length); + break; + case ConversionFunctions.MzToIndex: + tims_mz_to_index(fileHandle, frameId, inputPtr, (double*)outPtr, (UInt32)input.Length); + break; + case ConversionFunctions.ScanToOneOverK0: + tims_scannum_to_oneoverk0(fileHandle, frameId, inputPtr, 
(double*)outPtr, (UInt32)input.Length); + break; + case ConversionFunctions.OneOverK0ToScan: + tims_oneoverk0_to_scannum(fileHandle, frameId, inputPtr, (double*)outPtr, (UInt32)input.Length); + break; + case ConversionFunctions.ScanToVoltage: + tims_scannum_to_voltage(fileHandle, frameId, inputPtr, (double*)outPtr, (UInt32)input.Length); + break; + case ConversionFunctions.VoltageToScan: + tims_voltage_to_scannum(fileHandle, frameId, inputPtr, (double*)outPtr, (UInt32)input.Length); + break; + default: + break; + + } + } + + Marshal.Copy(outPtr, transformedValues, 0, input.Length); + } + finally { Marshal.FreeHGlobal(outPtr); } + } + return transformedValues; + } + + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + unsafe static extern void tims_index_to_mz + (UInt64 fileHandle, Int64 frame_id, double* inputPtr, double* outPtr, UInt32 count); + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + unsafe static extern void tims_mz_to_index + (UInt64 fileHandle, Int64 frame_id, double* inputPtr, double* outPtr, UInt32 count); + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + unsafe static extern void tims_scannum_to_oneoverk0 + (UInt64 fileHandle, Int64 frame_id, double* inputPtr, double* outPtr, UInt32 count); + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + unsafe static extern void tims_oneoverk0_to_scannum + (UInt64 fileHandle, Int64 frame_id, double* inputPtr, double* outPtr, UInt32 count); + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + unsafe static extern void tims_scannum_to_voltage + (UInt64 fileHandle, Int64 frame_id, double* inputPtr, double* outPtr, UInt32 count); + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + unsafe static extern void 
tims_voltage_to_scannum + (UInt64 fileHandle, Int64 frame_id, double* inputPtr, double* outPtr, UInt32 count); + } +} diff --git a/mzLib/Readers/timsTOF/TimsDataScan.cs b/mzLib/Readers/timsTOF/TimsDataScan.cs new file mode 100644 index 000000000..2a4b8c7fa --- /dev/null +++ b/mzLib/Readers/timsTOF/TimsDataScan.cs @@ -0,0 +1,93 @@ +using MzLibUtil; +using Readers; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace MassSpectrometry +{ + public class TimsDataScan : MsDataScan + { + public int ScanNumberStart { get; } + public int ScanNumberEnd { get; } + public double OneOverK0 { get; } + public int? PrecursorId { get; } + public long FrameId { get; } + /// + /// For PASEF Aggregate scans, contains the list of Frames where the same precursor was samples + /// This is a list of succesive PASEF scans capturing data on the same ion-mobility scan range and quadrupole isolation window + /// + public List FrameIds { get; } + internal int ComponentSpectraTotalPeaks { get; private set; } + + // Need to incorporate scan range somehow + public TimsDataScan(MzSpectrum massSpectrum, + int oneBasedScanNumber, + int msnOrder, + bool isCentroid, + Polarity polarity, + double retentionTime, + MzRange scanWindowRange, + string scanFilter, + MZAnalyzerType mzAnalyzer, + double totalIonCurrent, + double? injectionTime, + double[,] noiseData, + string nativeId, + long frameId, + int scanNumberStart, + int scanNumberEnd, + double medianOneOverK0, + int? precursorId = null, + double? selectedIonMz = null, + int? selectedIonChargeStateGuess = null, + double? selectedIonIntensity = null, + double? isolationMZ = null, + double? isolationWidth = null, + DissociationType? dissociationType = null, + int? oneBasedPrecursorScanNumber = null, + double? 
selectedIonMonoisotopicGuessMz = null, + string hcdEnergy = null, + List frames = null) : + base(massSpectrum, oneBasedScanNumber, msnOrder, isCentroid, polarity, + retentionTime, scanWindowRange, scanFilter, mzAnalyzer, totalIonCurrent, + injectionTime, noiseData, nativeId, selectedIonMz, selectedIonChargeStateGuess, + selectedIonIntensity, isolationMZ, isolationWidth, dissociationType, + oneBasedPrecursorScanNumber, selectedIonMonoisotopicGuessMz, hcdEnergy) + { + FrameId = frameId; + FrameIds = frames; + ScanNumberStart = scanNumberStart; + ScanNumberEnd = scanNumberEnd; + OneOverK0 = medianOneOverK0; + PrecursorId = precursorId; + ComponentSpectraTotalPeaks = 0; + } + + internal void AverageComponentSpectra(FrameProxyFactory proxyFactory, FilteringParams filteringParams = null) + { + MassSpectrum = TofSpectraMerger.MergeArraysToMs2Spectrum(mzArrays, intensityArrays, filteringParams); + TotalIonCurrent = MassSpectrum.SumOfAllY; + mzArrays.Clear(); + intensityArrays.Clear(); + } + + internal List mzArrays; + internal List intensityArrays; + + internal void AddComponentArrays(double[] mzs, int[] intensities) + { + if (mzArrays == null) + { + mzArrays = new(); + intensityArrays = new(); + } + mzArrays.Add(mzs); + intensityArrays.Add(intensities); + } + + + } +} diff --git a/mzLib/Readers/timsTOF/TimsTofFileReader.cs b/mzLib/Readers/timsTOF/TimsTofFileReader.cs new file mode 100644 index 000000000..8875c89eb --- /dev/null +++ b/mzLib/Readers/timsTOF/TimsTofFileReader.cs @@ -0,0 +1,571 @@ +using System; +using System.Runtime.InteropServices; +using System.Text; +using MassSpectrometry; +using System.Data.SQLite; +using Easy.Common.Extensions; +using MzLibUtil; +using UsefulProteomicsDatabases; +using System.Data.Common; +using Readers; +using System.Data.SqlClient; +using System.Data; +using ThermoFisher.CommonCore.Data.Business; +using Polarity = MassSpectrometry.Polarity; +using System.Security.AccessControl; +using System.Collections.Concurrent; +using 
System.Diagnostics; +using System.Security.Permissions; +using System.ComponentModel; + +namespace Readers +{ + public class TimsTofFileReader : MsDataFile, IDisposable + { + // timsTOF instruments collect frames, packets of ions collected by the tims, then analyzed + // over multiple scans with each scan corresponding to the same retention time but different + // ion mobility valuess. When reading the file, multiple scans from the same frame are collapsed into + // a single spectrum + + public TimsTofFileReader(string filePath) : base (filePath) { } + + private UInt64? _fileHandle; + private Object _fileLock; + private SQLiteConnection? _sqlConnection; + private int _maxThreads; + public int NumberOfFrames { get; private set; } + public List Ms1FrameIds { get; private set; } + internal FrameProxyFactory FrameProxyFactory { get; private set; } + + // I don't know what the default scan range is, and at this point I'm too afraid to ask... + private MzRange? _scanWindow; + public MzRange ScanWindow => _scanWindow ??= new MzRange(20, 2000); + public const string ScanFilter = "f"; + + public override void InitiateDynamicConnection() + { + if (!File.Exists(FilePath + @"\analysis.tdf") | !File.Exists(FilePath + @"\analysis.tdf_bin")) + { + throw new FileNotFoundException("Data file is missing .tdf and/or .tdf_bin file"); + } + + OpenSqlConnection(); + + if(_fileHandle != null) tims_close((UInt64)_fileHandle); + OpenBinaryFileConnection(); + _fileLock = new(); + + CountFrames(); + BuildProxyFactory(); + } + + internal void OpenSqlConnection() + { + if (_sqlConnection?.State == ConnectionState.Open) + return; + + _sqlConnection = new SQLiteConnection("Data Source=" + + Path.Combine(FilePath, "analysis.tdf") + + "; Version=3"); + try + { + _sqlConnection.Open(); + } + catch (Exception e) + { + throw new MzLibException("Error opening the .tdf file: " + e.Message); + } + } + + internal void OpenBinaryFileConnection() + { + byte[] binaryFileBytePath = 
BrukerFileReader.ConvertStringToUTF8ByteArray(FilePath); + _fileHandle = tims_open(binaryFileBytePath, 0); + if (_fileHandle == null || _fileHandle == 0) + throw new MzLibException("Could not open the analysis.tdf_bin file"); + } + + public override void CloseDynamicConnection() + { + if (_sqlConnection?.State == ConnectionState.Open) _sqlConnection.Close(); + _sqlConnection?.Dispose(); + if (_fileHandle != null) + { + tims_close((UInt64)_fileHandle); + _fileHandle = null; + } + } + + public void Dispose() + { + CloseDynamicConnection(); + } + + /// + /// WARNING! This method reads in the entire data file before + /// returning the requested scan! It is recommended to call the + /// GetScanFromPrecursorAndFrameIdFromDynamicConnection() + /// + public override MsDataScan GetOneBasedScanFromDynamicConnection(int oneBasedScanNumber, IFilteringParams filterParams = null) + { + if(oneBasedScanNumber <= 0) + throw new IndexOutOfRangeException("Invalid one-based index given when accessing data scans. Index: " + oneBasedScanNumber); + if (Scans != null && Scans.Length >= oneBasedScanNumber && Scans[oneBasedScanNumber - 1] != null) + return Scans[oneBasedScanNumber - 1]; + + LoadAllStaticData(filteringParams: (FilteringParams)filterParams); + if (oneBasedScanNumber > Scans.Length) + throw new IndexOutOfRangeException("Invalid one-based index given when accessing data scans. Index: " + oneBasedScanNumber); + return Scans[oneBasedScanNumber - 1]; + } + + /// + /// Returns a TimsDataScan with the specified frame and precursor id + /// WARNING! 
The returned data scan will have a OneBasedScanNumber of -1 + /// + /// + /// + /// + /// + /// + /// + /// + public TimsDataScan GetScanFromPrecursorAndFrameIdFromDynamicConnection(int precursorId, int frameId, IFilteringParams filteringParams = null) + { + if(_fileHandle == null || _fileHandle == 0 || _sqlConnection.IsCanceled() || FrameProxyFactory == null) + { + throw new MzLibException("The dynamic connection has not been created yet!"); + } + + var frameType = FrameProxyFactory.FramesTable.GetAnalysisType(frameId); + switch(frameType) + { + case TimsTofMsMsType.MS: + var records = GetMs1Records(frameId); + var recordForPrecursor = records.FirstOrDefault(x => x.PrecursorId == precursorId); + return GetMs1Scan(recordForPrecursor, FrameProxyFactory.GetFrameProxy(frameId), (FilteringParams)filteringParams); + case TimsTofMsMsType.PASEF: + return BuildPasefScanFromPrecursor([precursorId], (FilteringParams)filteringParams).FirstOrDefault(); + default: + throw new NotImplementedException(); + } + } + + internal void CountFrames() + { + if (_sqlConnection == null) return; + using var command = new SQLiteCommand(_sqlConnection); + command.CommandText = @"SELECT COUNT(*) FROM Frames;"; + using var sqliteReader = command.ExecuteReader(); + int count = 0; + while (sqliteReader.Read()) + { + count = sqliteReader.GetInt32(0); + break; + } + NumberOfFrames = count; + } + + internal void CountMS1Frames() + { + if (_sqlConnection == null) return; + using var command = new SQLiteCommand(_sqlConnection); + command.CommandText = @"SELECT f.Id FROM Frames f WHERE f.MsMsType = 0;"; + using var sqliteReader = command.ExecuteReader(); + Ms1FrameIds = new(); + + while (sqliteReader.Read()) + { + Ms1FrameIds.Add(sqliteReader.GetInt64(0)); + } + + } + + /// + /// Builds a new FrameProxyFactory to pull frames from the timsTOF data file + /// and sets the FrameProxyFactory property + /// + /// + internal void BuildProxyFactory() + { + if (_sqlConnection == null || _fileHandle == null) 
return; + var framesTable = new FrameTable(_sqlConnection, NumberOfFrames); + if (framesTable == null) + throw new MzLibException("Something went wrong while loading the Frames table from the analysis.tdf database."); + + int numberOfIndexedMzs = GetNumberOfDigitizerSamples(); + FrameProxyFactory = new FrameProxyFactory(framesTable, (ulong)_fileHandle, _fileLock, numberOfIndexedMzs); + } + + internal void CountPrecursors() + { + if (_sqlConnection == null) return; + using var command = new SQLiteCommand(_sqlConnection); + command.CommandText = @"SELECT MAX(Id) FROM Precursors;"; + using var sqliteReader = command.ExecuteReader(); + var columns = Enumerable.Range(0, sqliteReader.FieldCount) + .Select(sqliteReader.GetName).ToList(); + long maxPrecursorId = 0; + while (sqliteReader.Read()) + { + maxPrecursorId = sqliteReader.GetInt64(0); + } + Ms1ScanArray = new TimsDataScan[maxPrecursorId]; + PasefScanArray = new TimsDataScan[maxPrecursorId]; + } + + public ConcurrentBag Ms1ScansNoPrecursorsBag { internal get; set; } + public TimsDataScan[] Ms1ScanArray { internal get; set; } + public TimsDataScan[] PasefScanArray { internal get; set; } + + internal int GetNumberOfDigitizerSamples() + { + using var command = new SQLiteCommand(_sqlConnection); + command.CommandText = @"SELECT value FROM GlobalMetadata" + + " WHERE GlobalMetadata.Key = 'DigitizerNumSamples'"; + using var reader = command.ExecuteReader(); + reader.Read(); + return Int32.Parse(reader.GetString(0)); + } + + public override MsDataFile LoadAllStaticData(FilteringParams filteringParams = null, int maxThreads = 1) + { + InitiateDynamicConnection(); + + CountMS1Frames(); + CountPrecursors(); + + _maxThreads = maxThreads; + Ms1ScansNoPrecursorsBag = new(); + Parallel.ForEach( + Partitioner.Create(0, Ms1FrameIds.Count), + new ParallelOptions() { MaxDegreeOfParallelism = _maxThreads }, + (range) => + { + for (int i = range.Item1; i < range.Item2; i++) + { + BuildAllScans(Ms1FrameIds[i], filteringParams); + } + 
}); + + CloseDynamicConnection(); + AssignOneBasedPrecursorsToPasefScans(); + SourceFile = GetSourceFile(); + return this; + } + + internal void AssignOneBasedPrecursorsToPasefScans() + { + var localMs1Scans = this.Ms1ScanArray.Where(scan => scan != null).OrderBy(scan => scan.FrameId).ThenBy(scan => scan.PrecursorId).ToList(); + var localPasefScans = this.PasefScanArray.Where(scan => scan != null).OrderBy(scan => scan.PrecursorId).ToList(); + var localMs1ScansNoPrecursor = Ms1ScansNoPrecursorsBag.OrderBy(scan => scan.FrameId).ToList(); + TimsDataScan[] scanArray = new TimsDataScan[localMs1Scans.Count*2 + localMs1ScansNoPrecursor.Count]; + + int oneBasedScanIndex = 1; + int pasefScanIndex = 0; + int ms1NoPrecursorIndex = 0; + TimsDataScan? ms1ScanNoPrecursor = localMs1ScansNoPrecursor.IsNotNullOrEmpty() ? localMs1ScansNoPrecursor[ms1NoPrecursorIndex] : null; + //Write the scans to the scanArray and assign scan indices + for (int i = 0; i < localMs1Scans.Count; i++) + { + var ms1Scan = localMs1Scans[i]; + while (ms1ScanNoPrecursor != null && ms1ScanNoPrecursor.FrameId < ms1Scan.FrameId) + { + ms1ScanNoPrecursor.SetOneBasedScanNumber(oneBasedScanIndex); + scanArray[oneBasedScanIndex - 1] = ms1ScanNoPrecursor; + ms1NoPrecursorIndex++; + oneBasedScanIndex++; + ms1ScanNoPrecursor = ms1NoPrecursorIndex < localMs1ScansNoPrecursor.Count ? 
localMs1ScansNoPrecursor[ms1NoPrecursorIndex] : null; + } + ms1Scan.SetOneBasedScanNumber(oneBasedScanIndex); + scanArray[oneBasedScanIndex - 1] = ms1Scan; + oneBasedScanIndex++; + //if (ms1Scan.PrecursorId == -1) continue; // Continue if the scan didn't have any precursors (as there will be no MS2 scans) + + // This assumes that there is a one to one correspondence between the MS1 scans and the PASEF scans + var pasefScan = localPasefScans[pasefScanIndex]; + while(pasefScan.PrecursorId < ms1Scan.PrecursorId) + { + pasefScanIndex++; + pasefScan = localPasefScans[pasefScanIndex]; + } + if(pasefScan.PrecursorId == ms1Scan.PrecursorId) + { + pasefScan.SetOneBasedPrecursorScanNumber(ms1Scan.OneBasedScanNumber); + pasefScan.SetOneBasedScanNumber(oneBasedScanIndex); + scanArray[oneBasedScanIndex - 1] = pasefScan; + pasefScanIndex++; + oneBasedScanIndex++; + } + } + + if(oneBasedScanIndex < scanArray.Length) + { + // Some MS1 scans contain no peaks where the precursor was identified, so they are not included in the scanArray + scanArray = scanArray.Where(scan => scan != null).ToArray(); + } + + Scans = scanArray; + } + + /// + /// This function will create multiple MS1 scans from each MS1 frame in the timsTOF data file + /// One Ms1 Scan per precursor + /// It will then create an Ms2 scan for each precursor by averaging MS2 spectra for the precursor + /// collected over multiple frames + /// Created scans are then added to their respective scan arrays + /// + /// + /// + internal void BuildAllScans(long frameId, FilteringParams filteringParams) + { + FrameProxy frame = FrameProxyFactory.GetFrameProxy(frameId); + var records = GetMs1Records(frameId); + foreach(Ms1Record record in records) + { + TimsDataScan? 
dataScan = GetMs1Scan(record, frame, filteringParams); + if (dataScan != null) + { + if (dataScan.PrecursorId > 0) + Ms1ScanArray[(int)dataScan.PrecursorId - 1] = dataScan; + else + Ms1ScansNoPrecursorsBag.Add(dataScan); + } + } + + // Then, build ONE MS2 scan by averaging every PASEF frame that sampled that precursor + var pasefScans = BuildPasefScanFromPrecursor(precursorIds: records.Select(r => r.PrecursorId), filteringParams); + foreach (var scan in pasefScans) + { + if (scan?.PrecursorId != null) + PasefScanArray[(int)scan.PrecursorId - 1] = scan; + } + } + + internal List GetMs1Records(long frameId) + { + List records = new List(); + // Only do this if we have valid precursors (which we don't for like SRM/inclusion list type stuff) + using (var command = new SQLiteCommand(_sqlConnection)) + { + // This command finds all the precursors identified and fragmented in each MS/MS Pasef scan + // It is used to take an MS1 frame and create multiple "MsDataScans" by averaging the + // spectra from each scan within a given Ion Mobility (i.e. 
ScanNum) range + command.CommandText = + @"SELECT MIN(m.ScanNumBegin), MAX(m.ScanNumEnd), p.ScanNumber, p.Id" + + " FROM Precursors p" + + " INNER JOIN PasefFrameMsMsInfo m on m.Precursor = p.Id" + + " WHERE p.Parent = " + frameId.ToString() + + " GROUP BY p.Id;"; + using var sqliteReader = command.ExecuteReader(); + + while (sqliteReader.Read()) + { + var scanStart = sqliteReader.GetInt32(0); + var scanEnd = sqliteReader.GetInt32(1); + var scanMedian = sqliteReader.GetFloat(2); + int precursorId = sqliteReader.GetInt32(3); + records.Add(new Ms1Record(precursorId, scanStart, scanEnd, (double)scanMedian)); + } + } + // If no records were returned, then no precursors were observed in the frame + // In that case, create a record that contains every scan and a precursorID of -1 + if (records.Count == 0) + records.Add(new Ms1Record(-1, 1, FrameProxyFactory.FramesTable.NumScans[frameId - 1], FrameProxyFactory.FramesTable.NumScans[frameId - 1])); + return records; + } + + internal TimsDataScan? 
GetMs1Scan(Ms1Record record, FrameProxy frame, FilteringParams filteringParams) + { + List indexArrays = new(); + List intensityArrays = new(); + for (int scan = record.ScanStart; scan < record.ScanEnd; scan++) + { + indexArrays.Add(frame.GetScanIndices(scan-1)); + intensityArrays.Add(frame.GetScanIntensities(scan-1)); + } + // Step 2: Average those suckers + MzSpectrum averagedSpectrum = TofSpectraMerger.MergeArraysToMs1Spectrum(indexArrays, intensityArrays, FrameProxyFactory, filteringParams: filteringParams); + if (averagedSpectrum.Size < 1) + { + return null; + } + // Step 3: Make an MsDataScan bby + var dataScan = new TimsDataScan( + massSpectrum: averagedSpectrum, + oneBasedScanNumber: -1, // This gets adjusted once all data has been read + msnOrder: 1, + isCentroid: true, + polarity: FrameProxyFactory.GetPolarity(frame.FrameId), + retentionTime: FrameProxyFactory.GetRetentionTime(frame.FrameId), + scanWindowRange: ScanWindow, + scanFilter: ScanFilter, + mzAnalyzer: MZAnalyzerType.TOF, + totalIonCurrent: intensityArrays.Sum(array => array.Sum()), + injectionTime: FrameProxyFactory.GetInjectionTime(frame.FrameId), + noiseData: null, + nativeId: "frame=" + frame.FrameId.ToString() + + ";scans=" + record.ScanStart.ToString() + "-" + record.ScanEnd.ToString() + + ";precursor=" + record.PrecursorId.ToString(), + frameId: frame.FrameId, + scanNumberStart: record.ScanStart, + scanNumberEnd: record.ScanEnd, + medianOneOverK0: FrameProxyFactory.GetOneOverK0(record.ScanMedian), + precursorId: record.PrecursorId); + + return dataScan; + } + + internal List BuildPasefScanFromPrecursor(IEnumerable precursorIds, FilteringParams filteringParams) + { + HashSet allFrames = new(); + List pasefScans = new(); + + // Create TimsDataScans with all relevant metadata, but without an mzSpectrum + foreach (PasefRecord record in GetPasefRecords(precursorIds)) + { + allFrames.UnionWith(record.FrameList); + var dataScan = new TimsDataScan( + massSpectrum: null, + oneBasedScanNumber: -1, 
// This will be adjusted once all scans have been read + msnOrder: 2, + isCentroid: true, + polarity: FrameProxyFactory.GetPolarity(record.FrameList.First()), + retentionTime: FrameProxyFactory.GetRetentionTime(record.FrameList.First()), + scanWindowRange: ScanWindow, + scanFilter: ScanFilter, + mzAnalyzer: MZAnalyzerType.TOF, + totalIonCurrent: -1, // Will be set later + injectionTime: FrameProxyFactory.GetInjectionTimeSum(record.FrameList.First(), record.FrameList.Last()), + noiseData: null, + nativeId: "frames=" + record.FrameList.First().ToString() + "-" + record.FrameList.Last().ToString() + + ";scans=" + record.ScanStart.ToString() + "-" + record.ScanEnd.ToString(), + frameId: record.FrameList.First(), + scanNumberStart: record.ScanStart, + scanNumberEnd: record.ScanEnd, + medianOneOverK0: FrameProxyFactory.GetOneOverK0(record.ScanMedian), // Needs to be set later + precursorId: record.PrecursorId, + selectedIonMz: record.MostAbundantPrecursorMz, + selectedIonChargeStateGuess: record.Charge, + selectedIonIntensity: record.PrecursorIntensity, + isolationMZ: record.IsolationMz, + isolationWidth: record.IsolationWidth, + dissociationType: DissociationType.CID, + oneBasedPrecursorScanNumber: -1, // This will be set later + selectedIonMonoisotopicGuessMz: record.PrecursorMonoisotopicMz, + hcdEnergy: record.CollisionEnergy.ToString(), + frames: record.FrameList.ToList()); + pasefScans.Add(dataScan); + } + + // Grab all fragmentation spectra for each precursor + // Each TimsDataScan in pasefScans corresponds to one precursor. 
+ // A precursor can be isolated and fragmented in multiple pasef frames + // Here, we iterate through each frame, averaging the scans that correspond to each precursor + foreach (long frameId in allFrames) + { + FrameProxy frame = FrameProxyFactory.GetFrameProxy(frameId); + //Iterate through all the datascans created above with this frame + foreach (var scan in pasefScans) + { + if (scan.FrameIds.Contains(frameId)) + { + List indexArrays = new(); + List intensityArrays = new(); + for (int mobilityScanIdx = scan.ScanNumberStart; mobilityScanIdx < scan.ScanNumberEnd; mobilityScanIdx++) + { + indexArrays.Add(frame.GetScanIndices(mobilityScanIdx-1)); + intensityArrays.Add(frame.GetScanIntensities(mobilityScanIdx-1)); + } + // Perform frame level averaging, where all scans from one frame associated with a given precursor are merged and centroided + // Need to convert indexArrays to one uint[] and intensityArrays to one int[] + (double[] Mzs, int[] Intensities) summedArrays = TofSpectraMerger.MergeArraysToMzArray(indexArrays, intensityArrays, FrameProxyFactory); + scan.AddComponentArrays(summedArrays.Mzs, summedArrays.Intensities); + } + } + } + + // Now, we average the fragmentation spectra (each spectra originating in a different frame) + // to yield one spectrum per precursor + foreach (TimsDataScan scan in pasefScans) + { + scan.AverageComponentSpectra(FrameProxyFactory, filteringParams); + } + + return pasefScans; + } + + internal IEnumerable GetPasefRecords(IEnumerable precursorIds) + { + using (var command = new SQLiteCommand(_sqlConnection)) + { + string multiplePrecursorString = "(" + + String.Join(',', precursorIds.Select(id => "\'" + id.ToString() + "\'")) + + ")"; + // SQL Command for getting some info from both PasefFrameMsMsInfo table and + // Precursors table + command.CommandText = + @"SELECT GROUP_CONCAT(m.Frame), m.ScanNumBegin, m.ScanNumEnd, m.IsolationMz, m.IsolationWidth," + + " m.CollisionEnergy, p.LargestPeakMz, p.MonoisotopicMz, p.Charge, 
p.Intensity, p.ScanNumber, p.Id" + + " FROM PasefFrameMsMsInfo m" + + " INNER JOIN Precursors p on m.Precursor = p.Id" + + " WHERE m.Precursor IN " + multiplePrecursorString + + " GROUP BY m.Precursor;"; + + using var sqliteReader = command.ExecuteReader(); + + // Each call to read returns the information associated with a given precursor + while (sqliteReader.Read()) + { + var frameList = sqliteReader.GetString(0).Split(',').Select(id => Int64.Parse(id)); + var scanStart = sqliteReader.GetInt32(1); + var scanEnd = sqliteReader.GetInt32(2); + var isolationMz = sqliteReader.GetFloat(3); + var isolationWidth = sqliteReader.GetFloat(4); + var collisionEnergy = sqliteReader.GetFloat(5); + var mostAbundantPrecursorPeak = sqliteReader.GetFloat(6); + float precursorMonoisotopicMz = sqliteReader.IsDBNull(7) ? isolationMz : sqliteReader.GetFloat(7); + int charge = sqliteReader.IsDBNull(8) ? 1 : sqliteReader.GetInt32(8); + var precursorIntensity = sqliteReader.GetFloat(9); + var scanMedian = sqliteReader.GetFloat(10); + var precursorId = sqliteReader.GetInt32(11); + + yield return new PasefRecord(frameList, precursorId, scanStart, scanEnd, scanMedian, isolationMz, isolationWidth, collisionEnergy, mostAbundantPrecursorPeak, precursorMonoisotopicMz, charge, precursorIntensity); + } + } + } + + private const string nativeIdFormat = "Frame ID + scan number range format"; + private const string massSpecFileFormat = ".D format"; + public override SourceFile GetSourceFile() + { + // append the analysis.baf because the constructor for SourceFile will look for the + // parent directory. 
+ string fileName = FilePath + @"\analysis.tdf"; + return new SourceFile(nativeIdFormat, massSpecFileFormat, + null, null, id: null, filePath: fileName); + } + + #region Bruker Dll Functions + + /// + /// Returns a unique handle that references an open timsTOF data file + /// + /// + /// + /// + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + public static extern UInt64 tims_open + (byte[] analysis_directory_name_utf8, UInt32 use_recalibrated_state); + + /// + /// Closes a file connection to a .tdf binary file + /// + [DllImport("timsdata.dll", CharSet = CharSet.Ansi, CallingConvention = CallingConvention.Cdecl)] + public static extern void tims_close + (UInt64 fileHandle); + + #endregion Bruker Dll Functions + + } +} diff --git a/mzLib/Readers/timsTOF/TofSpectraMerger.cs b/mzLib/Readers/timsTOF/TofSpectraMerger.cs new file mode 100644 index 000000000..afab2082c --- /dev/null +++ b/mzLib/Readers/timsTOF/TofSpectraMerger.cs @@ -0,0 +1,384 @@ +using Easy.Common.Extensions; +using MassSpectrometry; +using MzLibUtil; + +namespace Readers +{ + public static class TofSpectraMerger + { + public static readonly double DefaultPpmTolerance = 10; + + #region IndexLevelOperations + // The following methods are used to merge and collapse index arrays and intensity arrays + // The timsTOF data format doesn't store m/z values directly, but rather indices in a lookup table where the mz values are stored + // Keeping these indices as ints allows for more efficient storage and processing of the data + + /// + /// Merges two index and intensity arrays using a two-pointer technique. + /// The merged arrays are sorted by index, ascending + /// + /// First index array. + /// Second index array. + /// First intensity array. + /// Second intensity array. + /// A tuple containing the merged indices and intensities. 
+ public static (uint[] Indices, int[] Intensities) TwoPointerMerge(uint[] indexArray1, uint[] indexArray2, int[] intensityArray1, int[] intensityArray2) + { + int p1 = 0; + int p2 = 0; + + uint[] mergedIndices = new uint[indexArray1.Length + indexArray2.Length]; + int[] mergedIntensities = new int[intensityArray1.Length + intensityArray2.Length]; + + while (p1 < indexArray1.Length || p2 < indexArray2.Length) + { + if (p1 == indexArray1.Length) + { + while (p2 < indexArray2.Length) + { + mergedIndices[p1 + p2] = indexArray2[p2]; + mergedIntensities[p1 + p2] = intensityArray2[p2]; + p2++; + } + } + else if (p2 == indexArray2.Length) + { + while (p1 < indexArray1.Length) + { + mergedIndices[p1 + p2] = indexArray1[p1]; + mergedIntensities[p1 + p2] = intensityArray1[p1]; + p1++; + } + } + else if (indexArray1[p1] < indexArray2[p2]) + { + mergedIndices[p1 + p2] = indexArray1[p1]; + mergedIntensities[p1 + p2] = intensityArray1[p1]; + p1++; + } + else + { + mergedIndices[p1 + p2] = indexArray2[p2]; + mergedIntensities[p1 + p2] = intensityArray2[p2]; + p2++; + } + } + + return (mergedIndices, mergedIntensities); + } + + /// + /// Collapses the given index and intensity arrays. + /// Adjacent index values (and their corresponding intensity values) are merged. + /// The idea here is to centroid a spectrum + /// + /// The index array to collapse. + /// The intensity array to collapse. + /// A tuple containing the collapsed indices and intensities. 
+        public static (uint[] Indices, int[] Intensities) CollapseArrays(uint[] indexArray, int[] intensityArray)
+        {
+            // Pre-size for the worst case, in which nothing collapses
+            List<uint> collapsedIndices = new List<uint>(indexArray.Length);
+            List<int> collapsedIntensities = new List<int>(intensityArray.Length);
+
+            int p1 = 0;
+            while (p1 < indexArray.Length)
+            {
+                // A cluster is every following index that is at most 2 above the FIRST index of the cluster.
+                // The bound is widened to ulong so that "+ 2" cannot wrap around for indices near uint.MaxValue.
+                ulong clusterUpperBound = (ulong)indexArray[p1] + 2;
+
+                // Advance p2 to the last element that still belongs to the cluster
+                int p2 = p1;
+                while (p2 + 1 < indexArray.Length && indexArray[p2 + 1] <= clusterUpperBound)
+                {
+                    p2++;
+                }
+
+                // Use the index at the median position of the cluster as the collapsed index
+                int medianPointer = (p1 + p2) / 2;
+                collapsedIndices.Add(indexArray[medianPointer]);
+
+                // Sum the intensities in the cluster to get the collapsed intensity
+                int summedIntensity = 0;
+                for (int i = p1; i <= p2; i++)
+                {
+                    summedIntensity += intensityArray[i];
+                }
+                collapsedIntensities.Add(summedIntensity);
+
+                // The next cluster starts right after this one
+                p1 = p2 + 1;
+            }
+
+            // ToArray copies exactly Count elements, so no TrimExcess is needed first
+            return (collapsedIndices.ToArray(), collapsedIntensities.ToArray());
+        }
+
+        #endregion
+        #region MzLevelOperations
+
+        /// <summary>
+        /// Builds an <see cref="MzSpectrum"/> from parallel m/z and intensity lists.
+        /// When <paramref name="filteringParams"/> is supplied and enables trimming for the given MS level,
+        /// the arrays are first passed through WindowModeHelper.Run.
+        /// </summary>
+        /// <param name="mzs">The m/z values. A double[] is used directly rather than copied.</param>
+        /// <param name="intensities">The intensity values, one per m/z value.</param>
+        /// <param name="filteringParams">Filtering parameters (optional).</param>
+        /// <param name="msnLevel">MS level: 1 selects ApplyTrimmingToMs1, anything above 1 selects ApplyTrimmingToMsMs.</param>
+        /// <returns>The (optionally filtered) spectrum.</returns>
+        /// <exception cref="ArgumentException">The m/z and intensity lists differ in length.</exception>
+        internal static MzSpectrum CreateFilteredSpectrum(IList<double> mzs, IList<int> intensities,
+            FilteringParams filteringParams = null, int msnLevel = 1)
+        {
+            // Reuse the caller's array when one was passed in; otherwise materialize one
+            double[] mzsArray = mzs as double[] ?? mzs.ToArray();
+
+            // Convert the intensities to an array
+            double[] intensitiesArray = intensities.Select(intensity => (double)intensity).ToArray();
+
+            if (mzsArray.Length != intensitiesArray.Length)
+            {
+                throw new ArgumentException("Collapsed m/z and intensity arrays are not the same length.");
+            }
+
+            bool trimmingRequested = filteringParams != null
+                && mzsArray.Length > 0
+                && ((filteringParams.ApplyTrimmingToMs1 && msnLevel == 1)
+                    || (filteringParams.ApplyTrimmingToMsMs && msnLevel > 1));
+
+            if (trimmingRequested)
+            {
+                // The first and last m/z values are passed as the window bounds
+                WindowModeHelper.Run(ref intensitiesArray,
+                    ref mzsArray, filteringParams,
+                    mzsArray[0], mzsArray[^1]);
+            }
+            // TODO: This would be more performant if we kept the intensities as ints
+            return new MzSpectrum(mzsArray, intensitiesArray, shouldCopy: false);
+        }
+
+        /// <summary>
+        /// Merges multiple index and intensity arrays into an MS1 spectrum.
+        /// This operation is somewhere between averaging and centroiding
+        /// In the TimsTofFileReader, MS1 scans are kept as index arrays and intensity arrays.
+        /// </summary>
+        /// <param name="indexArrays">List of index arrays.</param>
+        /// <param name="intensityArrays">List of intensity arrays.</param>
+        /// <param name="proxyFactory">Frame proxy factory.</param>
+        /// <param name="filteringParams">Filtering parameters (optional).</param>
+        /// <returns>
+        /// A merged MS1 spectrum, or null when <paramref name="indexArrays"/> fails the IsNotNullOrEmpty check,
+        /// <paramref name="intensityArrays"/> is null, or the two lists differ in count.
+        /// </returns>
+        internal static MzSpectrum MergeArraysToMs1Spectrum(
+            List<uint[]> indexArrays,
+            List<int[]> intensityArrays,
+            FrameProxyFactory proxyFactory,
+            FilteringParams filteringParams = null)
+        {
+            if (!indexArrays.IsNotNullOrEmpty() || intensityArrays == null || intensityArrays.Count != indexArrays.Count)
+            {
+                return null;
+            }
+
+            // Merge all index arrays and intensity arrays into a single array
+            uint[] combinedIndices = indexArrays[0];
+            int[] combinedIntensities = intensityArrays[0];
+            for (int i = 1; i < indexArrays.Count; i++)
+            {
+                var mergeResults = TwoPointerMerge(combinedIndices, indexArrays[i], combinedIntensities, intensityArrays[i]);
+                combinedIndices = mergeResults.Indices;
+                combinedIntensities = mergeResults.Intensities;
+            }
+
+            // Convert to m/z, then collapse the combined arrays into a single array (centroiding, more or less)
+            var centroidedResults = CollapseArrays(proxyFactory.ConvertIndicesToMz(combinedIndices), combinedIntensities);
+
+            return CreateFilteredSpectrum(
+                centroidedResults.Mzs,
+                centroidedResults.Intensities,
+                filteringParams,
+                msnLevel: 1);
+        }
+
+        /// <summary>
+        /// Merges multiple m/z and intensity arrays into an MS2 spectrum.
+        /// This operation is somewhere between averaging and centroiding.
+        /// In the TimsTofFileReader, MS2 component spectra are stored as
+        /// double[] m/z arrays and int[] intensity arrays.
+        /// Each component scan contributes one m/z array and one intensity array.
+        /// </summary>
+        /// <param name="mzArrays">List of m/z arrays.</param>
+        /// <param name="intensityArrays">List of intensity arrays.</param>
+        /// <param name="filteringParams">Filtering parameters (optional).</param>
+        /// <param name="ppmTolerance">PPM tolerance value (default is -1), forwarded to CollapseArrays.</param>
+        /// <returns>
+        /// A merged MS2 spectrum, or null when <paramref name="mzArrays"/> fails the IsNotNullOrEmpty check,
+        /// <paramref name="intensityArrays"/> is null, or the two lists differ in count.
+        /// </returns>
+        internal static MzSpectrum MergeArraysToMs2Spectrum(
+            List<double[]> mzArrays,
+            List<int[]> intensityArrays,
+            FilteringParams filteringParams = null,
+            double ppmTolerance = -1)
+        {
+            if (!mzArrays.IsNotNullOrEmpty() || intensityArrays == null || intensityArrays.Count != mzArrays.Count)
+            {
+                return null;
+            }
+
+            // Merge all m/z arrays and intensity arrays into a single array
+            double[] combinedMzs = mzArrays[0];
+            int[] combinedIntensities = intensityArrays[0];
+            for (int i = 1; i < mzArrays.Count; i++)
+            {
+                var mergeResults = TwoPointerMerge(combinedMzs, mzArrays[i], combinedIntensities, intensityArrays[i]);
+                combinedMzs = mergeResults.Mzs;
+                combinedIntensities = mergeResults.Intensities;
+            }
+
+            // Collapse the combined arrays into a single array (centroiding, more or less)
+            var centroidedResults = CollapseArrays(combinedMzs, combinedIntensities, ppmTolerance);
+
+            return CreateFilteredSpectrum(
+                centroidedResults.Mzs,
+                centroidedResults.Intensities,
+                filteringParams,
+                msnLevel: 2);
+        }
+
+        /// <summary>
+        /// Merges two m/z and intensity arrays using a two-pointer technique.
+        /// Used when merging component spectra into one MS2 spectrum
+        /// </summary>
+        /// <param name="mzArray1">First m/z array.</param>
+        /// <param name="mzArray2">Second m/z array.</param>
+        /// <param name="intensityArray1">First intensity array.</param>
+        /// <param name="intensityArray2">Second intensity array.</param>
+        /// <returns>A tuple containing the merged m/z values and intensities.</returns>
+        public static (double[] Mzs, int[] Intensities) TwoPointerMerge(double[] mzArray1, double[] mzArray2, int[] intensityArray1, int[] intensityArray2)
+        {
+            double[] mergedMzs = new double[mzArray1.Length + mzArray2.Length];
+            int[] mergedIntensities = new int[intensityArray1.Length + intensityArray2.Length];
+
+            int p1 = 0;    // read cursor for mzArray1 / intensityArray1
+            int p2 = 0;    // read cursor for mzArray2 / intensityArray2
+            int write = 0; // write cursor, always equal to p1 + p2
+
+            // While both inputs have elements left, take the strictly smaller m/z.
+            // On a tie the element from the second array is written first.
+            while (p1 < mzArray1.Length && p2 < mzArray2.Length)
+            {
+                if (mzArray1[p1] < mzArray2[p2])
+                {
+                    mergedMzs[write] = mzArray1[p1];
+                    mergedIntensities[write] = intensityArray1[p1];
+                    p1++;
+                }
+                else
+                {
+                    mergedMzs[write] = mzArray2[p2];
+                    mergedIntensities[write] = intensityArray2[p2];
+                    p2++;
+                }
+                write++;
+            }
+
+            // At most one of the two loops below does any work: copy whatever is left over
+            for (; p1 < mzArray1.Length; p1++, write++)
+            {
+                mergedMzs[write] = mzArray1[p1];
+                mergedIntensities[write] = intensityArray1[p1];
+            }
+            for (; p2 < mzArray2.Length; p2++, write++)
+            {
+                mergedMzs[write] = mzArray2[p2];
+                mergedIntensities[write] = intensityArray2[p2];
+            }
+
+            return (mergedMzs, mergedIntensities);
+        }
+
+        /// <summary>
+        /// Collapses the given mz and intensity arrays.
+        /// mz values within ppmTolerance (and their corresponding intensity values) are merged.
+        /// The idea here is to centroid a spectrum
+        /// </summary>
+        /// <param name="mzArray">The mz array to collapse.</param>
+        /// <param name="intensityArray">The intensity array to collapse.</param>
+        /// <param name="ppmTolerance">PPM tolerance value (default is 10). Values below 1 fall back to DefaultPpmTolerance.</param>
+        /// <returns>A tuple containing the collapsed mz and intensities.</returns>
+        internal static (double[] Mzs, int[] Intensities) CollapseArrays(double[] mzArray, int[] intensityArray, double ppmTolerance = 10)
+        {
+            // Define lists to store the collapsed m/z values and intensities
+            List<double> collapsedMzs = new();
+            List<int> collapsedIntensities = new();
+
+            PpmTolerance tol = new(ppmTolerance < 1 ? DefaultPpmTolerance : ppmTolerance);
+
+            // Initialize pointers to the first two elements in the m/z array
+            int p1 = 0;
+            int p2 = 1;
+            while (p1 < mzArray.Length)
+            {
+                double currentMz = mzArray[p1];
+                double upperBoundMz = tol.GetMaximumValue(currentMz);
+
+                // Find clusters of m/z values that are close together.
+                // The upper bound is re-anchored on every accepted value, so a cluster keeps growing
+                // as long as each value is within tolerance of the previous one.
+                while (p2 < mzArray.Length && upperBoundMz >= mzArray[p2])
+                {
+                    upperBoundMz = tol.GetMaximumValue(mzArray[p2]);
+                    p2++;
+                }
+                p2--; // Move the pointer back to the last member of the cluster
+
+                if (p1 == p2)
+                {
+                    // Single-member cluster: pass the peak through unchanged
+                    collapsedIntensities.Add(intensityArray[p1]);
+                    collapsedMzs.Add(mzArray[p1]);
+                }
+                else
+                {
+                    // Calculate the summed intensity in the cluster
+                    int summedIntensity = 0;
+                    for (int i = p1; i <= p2; i++)
+                    {
+                        summedIntensity += intensityArray[i];
+                    }
+                    collapsedIntensities.Add(summedIntensity);
+
+                    double collapsedMz = 0;
+                    if (summedIntensity != 0)
+                    {
+                        // Intensity-weighted averaging to determine the collapsed m/z of the cluster
+                        for (int i = p1; i <= p2; i++)
+                        {
+                            double weight = (double)intensityArray[i] / (double)summedIntensity;
+                            collapsedMz += weight * mzArray[i];
+                        }
+                    }
+                    else
+                    {
+                        // The weights would be x/0 (NaN or infinity) here, so use the plain mean of the cluster instead
+                        for (int i = p1; i <= p2; i++)
+                        {
+                            collapsedMz += mzArray[i];
+                        }
+                        collapsedMz /= (p2 - p1 + 1);
+                    }
+                    collapsedMzs.Add(collapsedMz);
+                }
+
+                // Move the pointers forward
+                p1 = p2 + 1;
+                p2 = p1 + 1;
+            }
+
+            return (collapsedMzs.ToArray(), collapsedIntensities.ToArray());
+        }
+
+        /// <summary>
+        /// Merges multiple index and intensity arrays into an m/z array.
+        /// Used when building the component spectra for an MS2 scan
+        /// </summary>
+        /// <param name="indexArrays">List of index arrays.</param>
+        /// <param name="intensityArrays">List of intensity arrays.</param>
+        /// <param name="proxyFactory">Frame proxy factory.</param>
+        /// <returns>A tuple containing the merged m/z values and intensities.</returns>
+        internal static (double[] Mzs, int[] Intensities) MergeArraysToMzArray(List<uint[]> indexArrays, List<int[]> intensityArrays, FrameProxyFactory proxyFactory)
+        {
+            // Unusable input yields an empty result rather than null
+            if (!indexArrays.IsNotNullOrEmpty() || intensityArrays == null || intensityArrays.Count != indexArrays.Count)
+            {
+                return (Array.Empty<double>(), Array.Empty<int>());
+            }
+
+            // Merge all index arrays and intensity arrays into a single array
+            uint[] combinedIndices = indexArrays[0];
+            int[] combinedIntensities = intensityArrays[0];
+            for (int i = 1; i < indexArrays.Count; i++)
+            {
+                var mergeResults = TwoPointerMerge(combinedIndices, indexArrays[i], combinedIntensities, intensityArrays[i]);
+                combinedIndices = mergeResults.Indices;
+                combinedIntensities = mergeResults.Intensities;
+            }
+            double[] mzsArray = proxyFactory.ConvertIndicesToMz(combinedIndices);
+
+            // Collapse the combined arrays into a single array (centroiding, more or less).
+            // No tolerance is passed, so the m/z overload's default ppmTolerance applies.
+            return CollapseArrays(mzsArray, combinedIntensities);
+        }
+
+        #endregion
+
+    }
+}
\ No newline at end of file
diff --git a/mzLib/Readers/timsTOF/baf2sql_c.dll b/mzLib/Readers/timsTOF/baf2sql_c.dll
new file mode 100644
index 000000000..3fb55e098
Binary files /dev/null and b/mzLib/Readers/timsTOF/baf2sql_c.dll differ
diff --git a/mzLib/Readers/timsTOF/baf2sql_c.lib b/mzLib/Readers/timsTOF/baf2sql_c.lib
new file mode 100644
index 000000000..a843d6add
Binary files /dev/null and b/mzLib/Readers/timsTOF/baf2sql_c.lib differ
diff --git a/mzLib/Readers/timsdata.dll b/mzLib/Readers/timsdata.dll
new file mode 100644
index 000000000..200944d6f
Binary files /dev/null and b/mzLib/Readers/timsdata.dll differ
diff --git a/mzLib/Readers/timsdata.lib b/mzLib/Readers/timsdata.lib
new file mode 100644
index 000000000..a97952a98
Binary files /dev/null and b/mzLib/Readers/timsdata.lib differ
diff --git a/mzLib/SpectralAveraging/Algorithms/SpectraAveraging.cs b/mzLib/SpectralAveraging/Algorithms/SpectraAveraging.cs
index 0bf3f7230..b93d473ab 100644
--- a/mzLib/SpectralAveraging/Algorithms/SpectraAveraging.cs
+++ 
b/mzLib/SpectralAveraging/Algorithms/SpectraAveraging.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Concurrent; using System.Collections.Generic; using System.Linq; using System.Threading.Tasks; @@ -39,8 +40,7 @@ public static double[][] AverageSpectra(double[][] xArrays, double[][] yArrays, /// yArrays of spectra to be averaged /// how to perform the averaging /// - private static double[][] MzBinning(double[][] xArrays, double[][] yArrays, - SpectralAveragingParameters parameters) + private static double[][] MzBinning(double[][] xArrays, double[][] yArrays, SpectralAveragingParameters parameters) { // get tics var tics = yArrays.Select(p => p.Sum()).ToArray(); @@ -56,21 +56,20 @@ private static double[][] MzBinning(double[][] xArrays, double[][] yArrays, var weights = SpectralWeighting.CalculateSpectraWeights(xArrays, yArrays, parameters.SpectralWeightingType); // reject outliers and average bins - List<(double mz, double intensity)> averagedPeaks = new(); - Parallel.ForEach(Enumerable.Range(0, parameters.MaxThreadsToUsePerFile), (iterationIndex) => - { - // each bin index that contains peaks - var binIncidences = bins.Keys.ToList(); + var binIncidences = bins.Keys.ToList(); + (double mz, double intensity)[] averagedPeaks = new (double, double)[binIncidences.Count]; + var partitioner = Partitioner.Create(0, binIncidences.Count); - // iterate through each bin index which contains peaks - for (; iterationIndex < binIncidences.Count; iterationIndex += parameters.MaxThreadsToUsePerFile) + Parallel.ForEach(partitioner, new ParallelOptions { MaxDegreeOfParallelism = parameters.MaxThreadsToUsePerFile }, (range, state) => + { + for (int i = range.Item1; i < range.Item2; i++) { - var peaksFromBin = bins[binIncidences[iterationIndex]]; + var peaksFromBin = bins[binIncidences[i]]; peaksFromBin = OutlierRejection.RejectOutliers(peaksFromBin, parameters); if (!peaksFromBin.Any()) continue; - lock (averagedPeaks) - averagedPeaks.Add(AverageBin(peaksFromBin, 
weights)); + + averagedPeaks[i] = AverageBin(peaksFromBin, weights); } }); @@ -86,7 +85,6 @@ private static double[][] MzBinning(double[][] xArrays, double[][] yArrays, }; } - #region Helpers /// diff --git a/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs b/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs index 28a573ac8..35839d1a5 100644 --- a/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs +++ b/mzLib/SpectralAveraging/Algorithms/SpectraFileAveraging.cs @@ -54,7 +54,7 @@ private static MsDataScan[] AverageAll(IReadOnlyCollection scans, Sp // create output MsDataScan averagedScan = new(averagedSpectrum, 1, representativeScan.OneBasedScanNumber, representativeScan.IsCentroid, representativeScan.Polarity, scans.Select(p => p.RetentionTime).Average(), - averagedSpectrum.Range, null, representativeScan.MzAnalyzer, scans.Select(p => p.TotalIonCurrent).Average(), + averagedSpectrum.Range, representativeScan.ScanFilter, representativeScan.MzAnalyzer, scans.Select(p => p.TotalIonCurrent).Average(), representativeScan.InjectionTime, null, representativeScan.NativeId); MsDataScan[] msDataScans = { averagedScan }; return msDataScans; @@ -129,7 +129,8 @@ private static MsDataScan GetAveragedDataScanFromAveragedSpectrum(MzSpectrum ave centralScan.IsCentroid, centralScan.Polarity, centralScan.RetentionTime, - averagedSpectrum.Range, null, + averagedSpectrum.Range, + centralScan.ScanFilter, centralScan.MzAnalyzer, averagedSpectrum.SumOfAllY, centralScan.InjectionTime, diff --git a/mzLib/SpectralAveraging/Util/SpectralAveragingParameters.cs b/mzLib/SpectralAveraging/Util/SpectralAveragingParameters.cs index 19f2b8be0..a18839461 100644 --- a/mzLib/SpectralAveraging/Util/SpectralAveragingParameters.cs +++ b/mzLib/SpectralAveraging/Util/SpectralAveragingParameters.cs @@ -37,7 +37,7 @@ public void SetValues(OutlierRejectionType outlierRejectionType = OutlierRejecti SpectralAveragingType spectralAveragingType = SpectralAveragingType.MzBinning, 
NormalizationType normalizationType = NormalizationType.RelativeToTics, SpectraFileAveragingType specAveragingType = SpectraFileAveragingType.AverageAll, - OutputType outputType = OutputType.MzML, int numToAverage = 5, int overlap = 2, + OutputType outputType = OutputType.MzML, int numToAverage = 5, int overlap = 4, double percentile = 0.1, double minSigma = 1.5, double maxSigma = 1.5, double binSize = 0.01, int maxThreads = 1) { @@ -67,7 +67,7 @@ public void SetDefaultValues() SpectraFileAveragingType = SpectraFileAveragingType.AverageAll; NormalizationType = NormalizationType.RelativeToTics; OutputType = OutputType.MzML; - ScanOverlap = 2; + ScanOverlap = 4; NumberOfScansToAverage = 5; Percentile = 0.1; MinSigmaValue = 1.5; diff --git a/mzLib/Test/AveragingTests/TestAveraging.cs b/mzLib/Test/AveragingTests/TestAveraging.cs index 9995af349..47eb8a729 100644 --- a/mzLib/Test/AveragingTests/TestAveraging.cs +++ b/mzLib/Test/AveragingTests/TestAveraging.cs @@ -40,7 +40,7 @@ public static List DummyMzCopy public static void OneTimeSetup() { ActualScans = MsDataFileReader.GetDataFile(Path.Combine(TestContext.CurrentContext.TestDirectory, - @"AveragingTestData\TDYeastFractionMS1.mzML")).GetAllScansList(); + @"AveragingTests\TestData\TDYeastFractionMS1.mzML")).GetAllScansList(); double[] xArray = new double[] { 100.1453781, 200, 300, 400, 500, 600, 700, 800, 900.4123745 }; double[] yArray1 = new double[] { 0, 5, 0, 0, 0, 0, 0, 10, 0, 0 }; double[] yArray2 = new double[] { 0, 5, 0, 0, 0, 0, 0, 10, 0, 0 }; diff --git a/mzLib/Test/AveragingTests/TestAveragingExtensions.cs b/mzLib/Test/AveragingTests/TestAveragingExtensions.cs index 82419db7e..6b9c27da4 100644 --- a/mzLib/Test/AveragingTests/TestAveragingExtensions.cs +++ b/mzLib/Test/AveragingTests/TestAveragingExtensions.cs @@ -37,7 +37,7 @@ public static void OneTimeSetup() { ActualScans = MsDataFileReader.GetDataFile(Path.Combine(TestContext.CurrentContext.TestDirectory, - 
@"AveragingTestData\TDYeastFractionMS1.mzML")).GetAllScansList().Take(25).ToList(); + @"AveragingTests\TestData\TDYeastFractionMS1.mzML")).GetAllScansList().Take(25).ToList(); double[] xArray = new double[] { 100.1453781, 200, 300, 400, 500, 600, 700, 800, 900.4123745 }; double[] yArray1 = new double[] { 0, 5, 0, 0, 0, 0, 0, 10, 0, 0 }; double[] yArray2 = new double[] { 0, 5, 0, 0, 0, 0, 0, 10, 0, 0 }; diff --git a/mzLib/Test/AveragingTests/TestAveragingSpectraWriteFile.cs b/mzLib/Test/AveragingTests/TestAveragingSpectraWriteFile.cs index 511eeb35c..69b8a9d33 100644 --- a/mzLib/Test/AveragingTests/TestAveragingSpectraWriteFile.cs +++ b/mzLib/Test/AveragingTests/TestAveragingSpectraWriteFile.cs @@ -25,7 +25,7 @@ public class TestAveragingSpectraWriteFile public static void OneTimeSetup() { Parameters = new SpectralAveragingParameters(); - OutputDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, @"AveragingTestData"); + OutputDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "AveragingTests", "TestData"); SpectraPath = Path.Combine(OutputDirectory, "TDYeastFractionMS1.mzML"); Scans = MsDataFileReader.GetDataFile(SpectraPath).GetAllScansList().Take(50).ToList(); @@ -112,8 +112,8 @@ public static void TestOutputToCustomDirectoryAndNameMzML() { // output to a different directory than the files were originally in Parameters.OutputType = OutputType.MzML; - string customDestinationDirectory = Path.Combine(OutputDirectory, "NewTestingDirectory"); - string customDestinationDirectory2 = Path.Combine(OutputDirectory, "NewTestingDirectory2"); + string customDestinationDirectory = Path.Combine(OutputDirectory, "NewAveragedTestingDirectory"); + string customDestinationDirectory2 = Path.Combine(OutputDirectory, "NewAveragedTestingDirectory2"); Directory.CreateDirectory(customDestinationDirectory); string customName = "AveragedSpectra"; diff --git a/mzLib/Test/AveragingTestData/TDYeastFractionMMResult.psmtsv 
b/mzLib/Test/AveragingTests/TestData/TDYeastFractionMMResult.psmtsv similarity index 100% rename from mzLib/Test/AveragingTestData/TDYeastFractionMMResult.psmtsv rename to mzLib/Test/AveragingTests/TestData/TDYeastFractionMMResult.psmtsv diff --git a/mzLib/Test/AveragingTestData/TDYeastFractionMS1.mzML b/mzLib/Test/AveragingTests/TestData/TDYeastFractionMS1.mzML similarity index 100% rename from mzLib/Test/AveragingTestData/TDYeastFractionMS1.mzML rename to mzLib/Test/AveragingTests/TestData/TDYeastFractionMS1.mzML diff --git a/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs b/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs index f46db019d..771c35ced 100644 --- a/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs +++ b/mzLib/Test/AveragingTests/TestSpectraFileAveraging.cs @@ -244,7 +244,7 @@ public static List DummyDDAScansOutOfOrder #endregion public static List ActualScans => MsDataFileReader.GetDataFile(Path.Combine(TestContext.CurrentContext.TestDirectory, - @"AveragingTestData\TDYeastFractionMS1.mzML")).GetAllScansList().Take(50).ToList(); + @"AveragingTests\TestData\TDYeastFractionMS1.mzML")).GetAllScansList().Take(50).ToList(); public static string NativeId; diff --git a/mzLib/Test/DataFiles/timsTOF_snippet.d/analysis.tdf b/mzLib/Test/DataFiles/timsTOF_snippet.d/analysis.tdf new file mode 100644 index 000000000..082f62298 Binary files /dev/null and b/mzLib/Test/DataFiles/timsTOF_snippet.d/analysis.tdf differ diff --git a/mzLib/Test/DataFiles/timsTOF_snippet.d/analysis.tdf_bin b/mzLib/Test/DataFiles/timsTOF_snippet.d/analysis.tdf_bin new file mode 100644 index 000000000..152a26cd1 Binary files /dev/null and b/mzLib/Test/DataFiles/timsTOF_snippet.d/analysis.tdf_bin differ diff --git a/mzLib/Test/DatabaseTests/SingleEntry_ModOrder1.xml b/mzLib/Test/DatabaseTests/SingleEntry_ModOrder1.xml new file mode 100644 index 000000000..3b7232bd4 --- /dev/null +++ b/mzLib/Test/DatabaseTests/SingleEntry_ModOrder1.xml @@ -0,0 +1,3617 @@ + + + ID 
(3R)-3-hydroxyasparagine on N +AC PTM-0369 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00035 +DR RESID; AA0026 +TR Eukaryota; taxId:40674 (Mammalia) +KW Hydroxylation + +// + ID (3R)-3-hydroxyaspartate on D +AC PTM-0371 +MT UniProt +FT MOD_RES +TG D +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00036 +DR RESID; AA0027 +TR Bacteria; taxId:68215 (Streptomyces griseoverticillatus) +TR Eukaryota; taxId:40674 (Mammalia) +KW Hydroxylation + +// + ID (3S)-3-hydroxyasparagine on N +AC PTM-0370 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:01401 +DR RESID; AA0478 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Hydroxylation + +// + ID (3S)-3-hydroxyaspartate on D +AC PTM-0473 +MT UniProt +FT MOD_RES +TG D +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:01919 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID (3S)-3-hydroxyhistidine on H +AC PTM-0477 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:01920 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID (4R)-5-hydroxyleucine on L +AC PTM-0491 +MT UniProt +FT MOD_RES +TG L +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:01373 +DR RESID; AA0443 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID (4R)-5-oxoleucine on L +AC PTM-0492 +MT UniProt +FT MOD_RES +TG L +PP Anywhere. +CF H-2O +MM 13.979265 +DR PSI-MOD; MOD:01374 +DR RESID; AA0444 +TR Eukaryota; taxId:33208 (Metazoa) +KW Oxidation + +// + ID 2',4',5'-topaquinone on Y +AC PTM-0009 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF H-2O2 +MM 29.974179 +DR PSI-MOD; MOD:00156 +DR RESID; AA0147 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW TPQ + +// + ID 3-hydroxyasparagine on N +AC PTM-0028 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. 
+CF O +MM 15.994915 +DR PSI-MOD; MOD:00035 +DR RESID; AA0026 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Hydroxylation + +// + ID 3-hydroxyproline on P +AC PTM-0030 +MT UniProt +FT MOD_RES +TG P +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00038 +DR RESID; AA0029 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID 3-oxoalanine (Cys) on C +AC PTM-0033 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF H-2OS-1 +MM -17.992806 +DR PSI-MOD; MOD:00193 +DR RESID; AA0185 +TR Bacteria; taxId:1224 (Proteobacteria), taxId:1239 (Firmicutes) +TR Eukaryota; taxId:3041 (Chlorophyta), taxId:33208 (Metazoa) + +// + ID 3'-nitrotyrosine on Y +AC PTM-0434 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF H-1NO2 +MM 44.985078 +DR PSI-MOD; MOD:01786 +DR RESID; AA0537 +TR Eukaryota; taxId:40674 (Mammalia) +KW Nitration + +// + ID 4-carboxyglutamate on E +AC PTM-0039 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. +CF CO2 +MM 43.989829 +DR PSI-MOD; MOD:00041 +DR RESID; AA0032 +TR Eukaryota; taxId:6447 (Mollusca), taxId:7742 (Vertebrata) +KW Gamma-carboxyglutamic acid + +// + ID 4-hydroxyproline on P +AC PTM-0043 +MT UniProt +FT MOD_RES +TG P +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00039 +DR RESID; AA0030 +TR Bacteria; taxId:415003 (Microbispora sp. (strain 107891)) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID 5-glutamyl glutamate on E +AC PTM-0479 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. +CF C5H7NO3 +MM 129.042593 +DR PSI-MOD; MOD:01970 +DR RESID; AA0612 +TR Archaea; taxId:2267 (Thermoproteaceae) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Isopeptide bond + +// + ID 5-glutamyl glycerylphosphorylethanolamine on E +AC PTM-0403 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. +CF C5H12NO5P +MM 197.045309 +DR PSI-MOD; MOD:00179 +DR RESID; AA0170 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphoprotein + +// + ID 5-hydroxylysine on K +AC PTM-0044 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. 
+CF O +MM 15.994915 +DR PSI-MOD; MOD:00037 +DR RESID; AA0028 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID Acetylation on K +MT Common Biological +TG K +PP Anywhere. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 +NL ETD:45.0204 +DI HCD:125.084063979 + +// + ID Acetylation on S +MT Less Common +TG S +PP Anywhere. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 + +// + ID Acetylation on T +MT Less Common +TG T +PP Anywhere. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 + +// + ID Acetylation on X +MT Common Biological +TG X +PP N-terminal. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 + +// + ID ADP-ribosyl glutamic acid on E +AC PTM-0646 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +TR Eukaryota; taxId:40674 (Mammalia) +KW ADP-ribosylation + +// + ID ADP-ribosylarginine on R +AC PTM-0053 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:00177 +DR RESID; AA0168 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW ADP-ribosylation + +// + ID ADP-ribosylcysteine on C +AC PTM-0055 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:00178 +DR RESID; AA0169 +TR Eukaryota; taxId:40674 (Mammalia) +KW ADP-ribosylation + +// + ID ADP-ribosylserine on S +AC PTM-0056 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:00242 +DR RESID; AA0237 +TR Eukaryota; taxId:9606 (Homo sapiens) +KW ADP-ribosylation + +// + ID Allysine on K +AC PTM-0059 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF H-3N-1O +MM -1.031634 +DR PSI-MOD; MOD:00130 +DR RESID; AA0121 +TR Eukaryota; taxId:6052 (Ephydatia muelleri), taxId:7742 (Vertebrata) + +// + ID Amidation on X +MT Less Common +TG X +PP Peptide C-terminal. +CF HNO-1 +MM -0.984015583 +DR Unimod; 2 + +// + ID Ammonia loss on C +MT Common Artifact +TG C +PP Peptide N-terminal. 
+CF H-3N-1 +MM -17.026549101 +DR Unimod; 385 + +// + ID Ammonia loss on N +MT Common Artifact +TG N +PP Anywhere. +CF H-3N-1 +MM -17.026549101 +DR Unimod; 385 + +// + ID Asymmetric dimethylarginine on R +AC PTM-0066 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00077 +DR RESID; AA0068 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Calcium on D +MT Metal +TG D +PP Anywhere. +CF H-2Ca +MM 37.946940799 +DR Unimod; 951 + +// + ID Calcium on E +MT Metal +TG E +PP Anywhere. +CF H-2Ca +MM 37.946940799 +DR Unimod; 951 + +// + ID Carbamidomethyl on D +MT Less Common +TG D +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on E +MT Less Common +TG E +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on H +MT Less Common +TG H +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on K +MT Less Common +TG K +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on S +MT Less Common +TG S +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on T +MT Less Common +TG T +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on X +MT Less Common +TG X +PP Peptide N-terminal. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on Y +MT Less Common +TG Y +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamyl on C +MT Common Artifact +TG C +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on K +MT Common Artifact +TG K +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on M +MT Common Artifact +TG M +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on R +MT Common Artifact +TG R +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on X +MT Common Artifact +TG X +PP Peptide N-terminal. 
+CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carboxylation on D +MT Common Biological +TG D +PP Anywhere. +CF CO2 +MM 43.989829239 +DR Unimod; 299 + +// + ID Carboxylation on E +MT Common Biological +TG E +PP Anywhere. +CF CO2 +MM 43.989829239 +DR Unimod; 299 + +// + ID Carboxylation on K +MT Common Biological +TG K +PP Anywhere. +CF CO2 +MM 43.989829239 +DR Unimod; 299 + +// + ID Carboxymethylation on K +MT Less Common +TG K +PP Anywhere. +CF C2H2O2 +MM 58.005479304 +DR Unimod; 6 + +// + ID Carboxymethylation on W +MT Less Common +TG W +PP Anywhere. +CF C2H2O2 +MM 58.005479304 +DR Unimod; 6 + +// + ID Carboxymethylation on X +MT Less Common +TG X +PP Peptide N-terminal. +CF C2H2O2 +MM 58.005479304 +DR Unimod; 6 + +// + ID Citrullination on R +MT Common Biological +TG R +PP Anywhere. +CF H-1N-1O +MM 0.984015583 +DR Unimod; 7 +NL HCD:43.0058 +DI HCD:129.090223533 + +// + ID Citrulline on R +AC PTM-0092 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF H-1N-1O +MM 0.984016 +DR PSI-MOD; MOD:00219 +DR RESID; AA0214 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Citrullination + +// + ID Crotonylation on K +MT Common Biological +TG K +PP Anywhere. +CF C4H4O +MM 68.026214748 +DR Unimod; 1363 +DI HCD:151.099723533 + +// + ID Cu[I] on D +MT Metal +TG D +PP Anywhere. +CF H-1Cu +MM 61.921772688 +DR Unimod; 531 + +// + ID Cu[I] on E +MT Metal +TG E +PP Anywhere. +CF H-1Cu +MM 61.921772688 +DR Unimod; 531 + +// + ID Cysteine methyl ester on C +AC PTM-0105 +MT UniProt +FT MOD_RES +TG C +PP C-terminal. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00114 +DR RESID; AA0105 +TR Bacteria; taxId:201174 (Actinobacteria) +TR Eukaryota; taxId:4751 (Fungi), taxId:33208 (Metazoa) +KW Methylation + +// + ID Cysteine persulfide on C +AC PTM-0106 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. 
+CF S +MM 31.972071 +DR PSI-MOD; MOD:00274 +DR RESID; AA0269 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID Cysteine sulfenic acid (-SOH) on C +AC PTM-0107 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00210 +DR RESID; AA0205 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW Oxidation + +// + ID Cysteine sulfinic acid (-SO2H) on C +AC PTM-0108 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF O2 +MM 31.989829 +DR PSI-MOD; MOD:00267 +DR RESID; AA0262 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW Oxidation + +// + ID Cysteine sulfonic acid (-SO3H) on C +AC PTM-0634 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF O3 +MM 47.984744 +DR PSI-MOD; MOD:00460 +DR RESID; AA0556 +TR Eukaryota; taxId:40674 (Mammalia) +KW Oxidation + +// + ID Deamidated asparagine on N +AC PTM-0116 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. +CF H-1N-1O +MM 0.984016 +DR PSI-MOD; MOD:00684 +DR RESID; AA0004 +TR Eukaryota; taxId:3702 (Arabidopsis thaliana), taxId:7742 (Vertebrata) + +// + ID Deamidated glutamine on Q +AC PTM-0117 +MT UniProt +FT MOD_RES +TG Q +PP Anywhere. +CF H-1N-1O +MM 0.984016 +DR PSI-MOD; MOD:00685 +DR RESID; AA0006 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:7742 (Vertebrata) + +// + ID Deamidation on N +MT Common Artifact +TG N +PP Anywhere. +CF H-1N-1O +MM 0.984015583 +DR Unimod; 7 + +// + ID Deamidation on Q +MT Common Artifact +TG Q +PP Anywhere. +CF H-1N-1O +MM 0.984015583 +DR Unimod; 7 + +// + ID Decarboxylation on D +MT Less Common +TG D +PP Anywhere. +CF C-1H-2O-1 +MM -30.010564684 +DR Unimod; 914 + +// + ID Decarboxylation on E +MT Less Common +TG E +PP Anywhere. +CF C-1H-2O-1 +MM -30.010564684 +DR Unimod; 914 + +// + ID Dehydroalanine on S +MT Less Common +TG S +PP Anywhere. 
+CF H-2O-1 +MM -18.010564684 + +// + ID Dehydrobutyrine on T +MT Less Common +TG T +PP Anywhere. +CF H-2O-1 +MM -18.010564684 + +// + ID Didehydro on Y +MT Less Common +TG Y +PP Anywhere. +CF H-2 +MM -2.015650064 +DR Unimod; 401 + +// + ID Dimethylated arginine on R +AC PTM-0341 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00783 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Dimethylation on N +MT Less Common +TG N +PP Anywhere. +CF C2H4 +MM 28.031300129 +DR Unimod; 36 + +// + ID Dimethylation on R +MT Common Biological +TG R +PP Anywhere. +CF C2H4 +MM 28.031300129 +DR Unimod; 36 +NL ETD:31.0422 or ETD:45.0579 + +// + ID Dioxidation on C +MT Less Common +TG C +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on F +MT Less Common +TG F +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on K +MT Less Common +TG K +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on M +MT Less Common +TG M +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on P +MT Less Common +TG P +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on R +MT Less Common +TG R +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on W +MT Less Common +TG W +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on Y +MT Less Common +TG Y +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Diphthamide on H +AC PTM-0118 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF C7H14N2O +MM 142.110613533 +DR PSI-MOD; MOD:00049 +DR RESID; AA0040 +TR Archaea; taxId:2157 (Archaea) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID Ethylation on D +MT Less Common +TG D +PP Anywhere. +CF C2H4 +MM 28.031300129 +DR Unimod; 280 + +// + ID Ethylation on X +MT Less Common +TG X +PP Peptide N-terminal. 
+CF C2H4 +MM 28.031300129 +DR Unimod; 280 + +// + ID Formylation on K +MT Common Biological +TG K +PP Anywhere. +CF CO +MM 27.99491462 +DR Unimod; 122 +DI HCD:111.068423533 + +// + ID Formylation on S +MT Less Common +TG S +PP Anywhere. +CF CO +MM 27.99491462 +DR Unimod; 122 + +// + ID Formylation on T +MT Less Common +TG T +PP Anywhere. +CF CO +MM 27.99491462 +DR Unimod; 122 + +// + ID Formylation on X +MT Less Common +TG X +PP Peptide N-terminal. +CF CO +MM 27.99491462 +DR Unimod; 122 + +// + ID Glycyl adenylate on G +AC PTM-0409 +MT UniProt +FT MOD_RES +TG G +PP C-terminal. +CF C10H12N5O6P +MM 329.05252 +DR PSI-MOD; MOD:01614 +DR RESID; AA0511 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Nucleotide-binding or Phosphoprotein + +// + ID Hydroxylation on K +MT Common Biological +TG K +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Hydroxylation on N +MT Common Biological +TG N +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Hydroxylation on P +MT Common Biological +TG P +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 +DI HCD:170.069143 + +// + ID Hydroxyproline on P +AC PTM-0149 +MT UniProt +FT MOD_RES +TG P +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00678 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID Hypusine on K +AC PTM-0150 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H9NO +MM 87.068414 +DR PSI-MOD; MOD:00125 +DR RESID; AA0116 +TR Archaea; taxId:2157 (Archaea) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hypusine + +// + ID Leucine methyl ester on L +AC PTM-0167 +MT UniProt +FT MOD_RES +TG L +PP C-terminal. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00304 +DR RESID; AA0299 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Magnesium on D +MT Metal +TG D +PP Anywhere. +CF H-2Mg +MM 21.969391633 +DR Unimod; 956 + +// + ID Magnesium on E +MT Metal +TG E +PP Anywhere. 
+CF H-2Mg +MM 21.969391633 +DR Unimod; 956 + +// + ID Methionine (R)-sulfoxide on M +AC PTM-0480 +MT UniProt +FT MOD_RES +TG M +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00720 +DR RESID; AA0581 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Oxidation + +// + ID Methionine sulfoxide on M +AC PTM-0469 +MT UniProt +FT MOD_RES +TG M +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00719 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Oxidation + +// + ID Methylation on C +MT Less Common +TG C +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on D +MT Less Common +TG D +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on E +MT Less Common +TG E +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on H +MT Less Common +TG H +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on I +MT Less Common +TG I +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on K +MT Common Biological +TG K +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on L +MT Less Common +TG L +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on N +MT Less Common +TG N +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on Q +MT Less Common +TG Q +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on R +MT Common Biological +TG R +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on S +MT Less Common +TG S +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on T +MT Less Common +TG T +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylhistidine on H +AC PTM-0176 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. 
+CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00661 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Eukaryota; taxId:5791 (Physarum polycephalum), taxId:7742 (Vertebrata) +KW Methylation + +// + ID N-acetylalanine on A +AC PTM-0199 +MT UniProt +FT MOD_RES +TG A +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00050 +DR RESID; AA0041 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylaspartate on D +AC PTM-0200 +MT UniProt +FT MOD_RES +TG D +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00051 +DR RESID; AA0042 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylcysteine on C +AC PTM-0201 +MT UniProt +FT MOD_RES +TG C +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00052 +DR RESID; AA0043 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Eukaryota; taxId:7742 (Vertebrata) +KW Acetylation + +// + ID N-acetylglutamate on E +AC PTM-0202 +MT UniProt +FT MOD_RES +TG E +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00053 +DR RESID; AA0044 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylglycine on G +AC PTM-0203 +MT UniProt +FT MOD_RES +TG G +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00055 +DR RESID; AA0046 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylmethionine on M +AC PTM-0205 +MT UniProt +FT MOD_RES +TG M +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00058 +DR RESID; AA0049 +TR Archaea; taxId:2287 (Sulfolobus solfataricus) +TR Bacteria; taxId:1270 (Micrococcus luteus) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylproline on P +AC PTM-0206 +MT UniProt +FT MOD_RES +TG P +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00059 +DR RESID; AA0050 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:33090 (Viridiplantae), taxId:40674 (Mammalia) +KW Acetylation + +// + ID N-acetylserine on S +AC PTM-0207 +MT UniProt +FT MOD_RES +TG S +PP N-terminal. 
+CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00060 +DR RESID; AA0051 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylthreonine on T +AC PTM-0208 +MT UniProt +FT MOD_RES +TG T +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00061 +DR RESID; AA0052 +TR Bacteria; taxId:90370 (Salmonella typhi) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylvaline on V +AC PTM-0210 +MT UniProt +FT MOD_RES +TG V +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00063 +DR RESID; AA0054 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:3055 (Chlamydomonas reinhardtii), taxId:33208 (Metazoa) +KW Acetylation + +// + ID N-methylproline on P +AC PTM-0219 +MT UniProt +FT MOD_RES +TG P +PP N-terminal. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00830 +DR RESID; AA0419 +TR Eukaryota; taxId:7227 (Drosophila melanogaster) +KW Methylation + +// + ID N,N-dimethylproline on P +AC PTM-0179 +MT UniProt +FT MOD_RES +TG P +PP N-terminal. +CF C2H4 +MM 28.031300533 +DR PSI-MOD; MOD:00075 +DR RESID; AA0066 +TR Eukaryota; taxId:6446 (Sipunculus nudus), taxId:7586 (Echinodermata), taxId:33682 (Euglenozoa) +KW Methylation + +// + ID N,N,N-trimethylalanine on A +AC PTM-0177 +MT UniProt +FT MOD_RES +TG A +PP N-terminal. +CF C3H6 +MM 42.046950533 +DR PSI-MOD; MOD:00071 +DR RESID; AA0062 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:5908 (Tetrahymena pyriformis), taxId:9986 (Oryctolagus cuniculus) +KW Methylation + +// + ID N,N,N-trimethylglycine on G +AC PTM-0485 +MT UniProt +FT MOD_RES +TG G +PP N-terminal. +CF C3H7 +MM 43.054775 +DR PSI-MOD; MOD:01982 +DR RESID; AA0619 +TR Eukaryota; taxId:40674 (Mammalia) +KW Methylation + +// + ID N5-methylglutamine on Q +AC PTM-0185 +MT UniProt +FT MOD_RES +TG Q +PP Anywhere. 
+CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00080 +DR RESID; AA0071 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:4932 (Saccharomyces cerevisiae) +KW Methylation + +// + ID N6-(2-hydroxyisobutyryl)lysine on K +AC PTM-0638 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H6O2 +MM 86.03678 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID N6-(ADP-ribosyl)lysine on K +AC PTM-0355 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:01399 +DR RESID; AA0476 +TR Eukaryota; taxId:10090 (Mus musculus) +KW ADP-ribosylation + +// + ID N6-(beta-hydroxybutyryl)lysine on K +AC PTM-0499 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H7O2 +MM 87.044604 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID N6-(pyridoxal phosphate)lysine on K +AC PTM-0387 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C8H8NO5P +MM 229.014009 +DR PSI-MOD; MOD:00128 +DR RESID; AA0119 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Pyridoxal phosphate + +// + ID N6-(retinylidene)lysine on K +AC PTM-0388 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C20H26 +MM 266.203451 +DR PSI-MOD; MOD:00129 +DR RESID; AA0120 +TR Archaea; taxId:28890 (Euryarchaeota), taxId:2236 (Halobacteriaceae) +TR Bacteria; taxId:1236 (Gammaproteobacteria) +TR Eukaryota; taxId:33154 (Opisthokonta) +KW Retinal protein + +// + ID N6-acetyllysine on K +AC PTM-0190 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00064 +DR RESID; AA0055 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N6-biotinyllysine on K +AC PTM-0382 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. 
+CF C10H14N2O2S +MM 226.077599 +DR PSI-MOD; MOD:00126 +DR RESID; AA0117 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Biotin + +// + ID N6-butyryllysine on K +AC PTM-0637 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H6O +MM 70.041865 +DR PSI-MOD; MOD:01781 +DR RESID; AA0532 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-carboxylysine on K +AC PTM-0191 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF CO2 +MM 43.989829 +DR PSI-MOD; MOD:00123 +DR RESID; AA0114 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2763 (Rhodophyta), taxId:2830 (Haptophyceae), taxId:3027 (Cryptophyta), taxId:33090 (Viridiplantae), taxId:33634 (Stramenopiles), taxId:33682 (Euglenozoa), taxId:38254 (Glaucocystophyceae) + +// + ID N6-crotonyllysine on K +AC PTM-0475 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H4O +MM 68.026215 +DR PSI-MOD; MOD:01892 +DR RESID; AA0567 +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-glutaryllysine on K +AC PTM-0487 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C5H6O3 +MM 114.031694 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-lipoyllysine on K +AC PTM-0383 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C8H12OS2 +MM 188.032957 +DR PSI-MOD; MOD:00127 +DR RESID; AA0118 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Lipoyl + +// + ID N6-malonyllysine on K +AC PTM-0467 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C3H2O3 +MM 86.000394 +DR PSI-MOD; MOD:01893 +DR RESID; AA0568 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-methyllysine on K +AC PTM-0194 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. 
+CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00085 +DR RESID; AA0076 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID N6-propionyllysine on K +AC PTM-0642 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C3H4O +MM 56.026215 +DR PSI-MOD; MOD:01398 +DR RESID; AA0475 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) + +// + ID N6-succinyllysine on K +AC PTM-0438 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H4O3 +MM 100.016044 +DR PSI-MOD; MOD:01819 +DR RESID; AA0545 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6,N6-dimethyllysine on K +AC PTM-0188 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00084 +DR RESID; AA0075 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID N6,N6,N6-trimethyllysine on K +AC PTM-0187 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C3H6 +MM 42.046950533 +DR PSI-MOD; MOD:00083 +DR RESID; AA0074 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Nitrated tyrosine on Y +AC PTM-0213 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF H-1NO2 +MM 44.985078 +DR PSI-MOD; MOD:01352 +TR Eukaryota; taxId:40674 (Mammalia) +KW Nitration + +// + ID O-(2-cholinephosphoryl)serine on S +AC PTM-0400 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF C5H12NO3P +MM 165.055479533 +DR PSI-MOD; MOD:01588 +DR RESID; AA0498 +TR Bacteria; taxId:206351 (Neisseriales) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphoprotein + +// + ID O-(pantetheine 4'-phosphoryl)serine on S +AC PTM-0391 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. 
+CF C11H21N2O6PS +MM 340.085794 +DR PSI-MOD; MOD:00159 +DR RESID; AA0150 +TR Bacteria; taxId:638 (Arsenophonus nasoniae), taxId:112 (Planctomycetales) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphopantetheine or Phosphoprotein + +// + ID O-acetylserine on S +AC PTM-0232 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00369 +DR RESID; AA0364 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID Omega-N-methylarginine on R +AC PTM-0237 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00078 +DR RESID; AA0069 +TR Eukaryota; taxId:5661 (Leishmania donovani), taxId:40674 (Mammalia) +KW Methylation + +// + ID Oxidation on C +MT Less Common +TG C +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on D +MT Less Common +TG D +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on E +MT Less Common +TG E +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on F +MT Less Common +TG F +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on H +MT Less Common +TG H +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on I +MT Less Common +TG I +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on L +MT Less Common +TG L +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on Q +MT Less Common +TG Q +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on R +MT Less Common +TG R +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on S +MT Less Common +TG S +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on T +MT Less Common +TG T +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on V +MT Less Common +TG V +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on W +MT Less Common +TG W +PP Anywhere. 
+CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on Y +MT Less Common +TG Y +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation to Kynurenine on W +MT Less Common +TG W +PP Anywhere. +CF C-1O +MM 3.99491462 +DR Unimod; 351 +DI HCD:194.06914219 + +// + ID Phosphoarginine on R +AC PTM-0250 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00227 +DR RESID; AA0222 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW Phosphoprotein + +// + ID Phosphohistidine on H +AC PTM-0252 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00890 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphoprotein + +// + ID Phosphorylation on S +MT Common Biological +TG S +PP Anywhere. +CF HO3P +MM 79.966330889 +DR Unimod; 21 +NL HCD:0 or HCD:97.976895573 + +// + ID Phosphorylation on T +MT Common Biological +TG T +PP Anywhere. +CF HO3P +MM 79.966330889 +DR Unimod; 21 +NL HCD:0 or HCD:97.976895573 + +// + ID Phosphorylation on Y +MT Common Biological +TG Y +PP Anywhere. +CF HO3P +MM 79.966330889 +DR Unimod; 21 +NL HCD:0 or HCD:97.976895573 +DI HCD:215.034744803 + +// + ID Phosphoserine on S +AC PTM-0253 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00046 +DR RESID; AA0037 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein + +// + ID Phosphothreonine on T +AC PTM-0254 +MT UniProt +FT MOD_RES +TG T +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00047 +DR RESID; AA0038 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein + +// + ID Phosphotyrosine on Y +AC PTM-0255 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. 
+CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00048 +DR RESID; AA0039 +TR Archaea; taxId:2287 (Sulfolobus solfataricus) +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein + +// + ID Potassium on D +MT Metal +TG D +PP Anywhere. +CF H-1K +MM 37.955881454 +DR Unimod; 530 + +// + ID Potassium on E +MT Metal +TG E +PP Anywhere. +CF H-1K +MM 37.955881454 +DR Unimod; 530 + +// + ID Proline pyrrole to pyrrolidine six member ring on P +MT Less Common +TG P +PP Anywhere. +CF C +MM 12 + +// + ID Propionamidation on C +MT Less Common +TG C +PP Anywhere. +CF C3H5NO +MM 71.037113785 +DR Unimod; 24 + +// + ID Propionamidation on K +MT Less Common +TG K +PP Anywhere. +CF C3H5NO +MM 71.037113785 +DR Unimod; 24 + +// + ID Propionamidation on X +MT Less Common +TG X +PP Peptide N-terminal. +CF C3H5NO +MM 71.037113785 +DR Unimod; 24 + +// + ID Propionylation on K +MT Less Common +TG K +PP Anywhere. +CF C3H4O +MM 56.026214748 +DR Unimod; 58 +DI HCD:139.099823533 + +// + ID Pros-8alpha-FAD histidine on H +AC PTM-0258 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF C27H31N9O15P2 +MM 783.141485 +DR PSI-MOD; MOD:00153 +DR RESID; AA0144 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW FAD + +// + ID Pros-methylhistidine on H +AC PTM-0259 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00082 +DR RESID; AA0073 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Eukaryota; taxId:7742 (Vertebrata) +KW Methylation + +// + ID Pyrrolidinone on P +MT Less Common +TG P +PP Anywhere. +CF C-1H-2O-1 +MM -30.010564684 +DR Unimod; 360 + +// + ID Pyrrolidone carboxylic acid on Q +AC PTM-0261 +MT UniProt +FT MOD_RES +TG Q +PP N-terminal. 
+CF H-3N-1 +MM -17.026549 +DR PSI-MOD; MOD:00040 +DR RESID; AA0031 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Pyrrolidone carboxylic acid + +// + ID Reduction on D +MT Less Common +TG D +PP Anywhere. +CF O-1 +MM -15.99491462 +DR Unimod; 447 + +// + ID Reduction on S +MT Less Common +TG S +PP Anywhere. +CF O-1 +MM -15.99491462 +DR Unimod; 447 + +// + ID Reduction on T +MT Less Common +TG T +PP Anywhere. +CF O-1 +MM -15.99491462 +DR Unimod; 447 + +// + ID S-(dipyrrolylmethanemethyl)cysteine on C +AC PTM-0421 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C20H22N2O8 +MM 418.137616 +DR PSI-MOD; MOD:00257 +DR RESID; AA0252 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID S-8alpha-FAD cysteine on C +AC PTM-0272 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C27H31N9O15P2 +MM 783.141485 +DR PSI-MOD; MOD:00152 +DR RESID; AA0143 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW FAD + +// + ID S-cysteinyl cysteine on C +AC PTM-0415 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C3H5NO2S +MM 119.004099 +DR PSI-MOD; MOD:00765 +DR RESID; AA0025 +TR Bacteria; taxId:91347 (Enterobacterales) +TR Eukaryota; taxId:40674 (Mammalia) + +// + ID S-glutathionyl cysteine on C +AC PTM-0311 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C10H15N3O6S +MM 305.068156 +DR PSI-MOD; MOD:00234 +DR RESID; AA0229 +TR Bacteria; taxId:83333 (Escherichia coli (strain K12)) +TR Eukaryota; taxId:3981 (Hevea brasiliensis), taxId:7742 (Vertebrata) +KW Glutathionylation + +// + ID S-methylcysteine on C +AC PTM-0279 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. 
+CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00239 +DR RESID; AA0234 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:1421 (Lysinibacillus sphaericus) +TR Eukaryota; taxId:3055 (Chlamydomonas reinhardtii) +KW Methylation + +// + ID S-nitrosocysteine on C +AC PTM-0280 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF H-1NO +MM 28.990164 +DR PSI-MOD; MOD:00235 +DR RESID; AA0230 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW S-nitrosylation + +// + ID Sodium on D +MT Metal +TG D +PP Anywhere. +CF H-1Na +MM 21.98194425 +DR Unimod; 30 + +// + ID Sodium on E +MT Metal +TG E +PP Anywhere. +CF H-1Na +MM 21.98194425 +DR Unimod; 30 + +// + ID Sulfonation on S +MT Less Common +TG S +PP Anywhere. +CF O3S +MM 79.956815033 +DR Unimod; 40 +NL AnyActivationType:79.956815033 + +// + ID Sulfonation on T +MT Less Common +TG T +PP Anywhere. +CF O3S +MM 79.956815033 +DR Unimod; 40 +NL AnyActivationType:79.956815033 + +// + ID Sulfonation on Y +MT Common Biological +TG Y +PP Anywhere. +CF O3S +MM 79.956815033 +DR Unimod; 40 +NL AnyActivationType:79.956815033 + +// + ID Sulfotyrosine on Y +AC PTM-0286 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF O3S +MM 79.956815 +DR PSI-MOD; MOD:00181 +DR RESID; AA0172 +TR Eukaryota; taxId:33208 (Metazoa), taxId:33090 (Viridiplantae) +KW Sulfation + +// + ID Symmetric dimethylarginine on R +AC PTM-0287 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00076 +DR RESID; AA0067 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Methylation + +// + ID Tele-8alpha-FAD histidine on H +AC PTM-0288 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF C27H31N9O15P2 +MM 783.141485 +DR PSI-MOD; MOD:00226 +DR RESID; AA0221 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW FAD + +// + ID Tele-methylhistidine on H +AC PTM-0290 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. 
+CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00322 +DR RESID; AA0317 +TR Eukaryota; taxId:5791 (Physarum polycephalum), taxId:7742 (Vertebrata) +KW Methylation + +// + ID Thyroxine on Y +AC PTM-0294 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF C6I4O +MM 595.612805 +DR PSI-MOD; MOD:00187 +DR RESID; AA0178 +TR Eukaryota; taxId:40674 (Mammalia) +KW Iodination + +// + ID Triiodothyronine on Y +AC PTM-0295 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF C6HI3O +MM 469.716158 +DR PSI-MOD; MOD:00186 +DR RESID; AA0177 +TR Eukaryota; taxId:40674 (Mammalia) +KW Iodination + +// + ID Trioxidation on C +MT Less Common +TG C +PP Anywhere. +CF O3 +MM 47.984743859 +DR Unimod; 345 + +// + ID Water loss on D +MT Less Common +TG D +PP Anywhere. +CF H-2O-1 +MM -18.010564684 +DR Unimod; 23 + +// + ID Water Loss on E +MT Common Artifact +TG E +PP Peptide N-terminal. +CF H-2O-1 +MM -18.010564684 +DR Unimod; 23 + +// + ID Zinc on D +MT Metal +TG D +PP Anywhere. +CF H-2Zn +MM 61.913491946 +DR Unimod; 954 + +// + ID Zinc on E +MT Metal +TG E +PP Anywhere. 
+CF H-2Zn +MM 61.913491946 +DR Unimod; 954 + +// + + P63260 + ACTG_MOUSE + + + Actin, cytoplasmic 2 + + + + Actg1 + Actg + + + Mus musculus + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRKDLYANTVLSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRKCF + + \ No newline at end of file diff --git a/mzLib/Test/DatabaseTests/SingleEntry_ModOrder2.xml b/mzLib/Test/DatabaseTests/SingleEntry_ModOrder2.xml new file mode 100644 index 000000000..1bd37acd1 --- /dev/null +++ b/mzLib/Test/DatabaseTests/SingleEntry_ModOrder2.xml @@ -0,0 +1,3617 @@ + + + ID (3R)-3-hydroxyasparagine on N +AC PTM-0369 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00035 +DR RESID; AA0026 +TR Eukaryota; taxId:40674 (Mammalia) +KW Hydroxylation + +// + ID (3R)-3-hydroxyaspartate on D +AC PTM-0371 +MT UniProt +FT MOD_RES +TG D +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00036 +DR RESID; AA0027 +TR Bacteria; taxId:68215 (Streptomyces griseoverticillatus) +TR Eukaryota; taxId:40674 (Mammalia) +KW Hydroxylation + +// + ID (3S)-3-hydroxyasparagine on N +AC PTM-0370 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:01401 +DR RESID; AA0478 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Hydroxylation + +// + ID (3S)-3-hydroxyaspartate on D +AC PTM-0473 +MT UniProt +FT MOD_RES +TG D +PP Anywhere. 
+CF O +MM 15.994915 +DR PSI-MOD; MOD:01919 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID (3S)-3-hydroxyhistidine on H +AC PTM-0477 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:01920 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID (4R)-5-hydroxyleucine on L +AC PTM-0491 +MT UniProt +FT MOD_RES +TG L +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:01373 +DR RESID; AA0443 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID (4R)-5-oxoleucine on L +AC PTM-0492 +MT UniProt +FT MOD_RES +TG L +PP Anywhere. +CF H-2O +MM 13.979265 +DR PSI-MOD; MOD:01374 +DR RESID; AA0444 +TR Eukaryota; taxId:33208 (Metazoa) +KW Oxidation + +// + ID 2',4',5'-topaquinone on Y +AC PTM-0009 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF H-2O2 +MM 29.974179 +DR PSI-MOD; MOD:00156 +DR RESID; AA0147 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW TPQ + +// + ID 3-hydroxyasparagine on N +AC PTM-0028 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00035 +DR RESID; AA0026 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Hydroxylation + +// + ID 3-hydroxyproline on P +AC PTM-0030 +MT UniProt +FT MOD_RES +TG P +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00038 +DR RESID; AA0029 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID 3-oxoalanine (Cys) on C +AC PTM-0033 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF H-2OS-1 +MM -17.992806 +DR PSI-MOD; MOD:00193 +DR RESID; AA0185 +TR Bacteria; taxId:1224 (Proteobacteria), taxId:1239 (Firmicutes) +TR Eukaryota; taxId:3041 (Chlorophyta), taxId:33208 (Metazoa) + +// + ID 3'-nitrotyrosine on Y +AC PTM-0434 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF H-1NO2 +MM 44.985078 +DR PSI-MOD; MOD:01786 +DR RESID; AA0537 +TR Eukaryota; taxId:40674 (Mammalia) +KW Nitration + +// + ID 4-carboxyglutamate on E +AC PTM-0039 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. 
+CF CO2 +MM 43.989829 +DR PSI-MOD; MOD:00041 +DR RESID; AA0032 +TR Eukaryota; taxId:6447 (Mollusca), taxId:7742 (Vertebrata) +KW Gamma-carboxyglutamic acid + +// + ID 4-hydroxyproline on P +AC PTM-0043 +MT UniProt +FT MOD_RES +TG P +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00039 +DR RESID; AA0030 +TR Bacteria; taxId:415003 (Microbispora sp. (strain 107891)) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID 5-glutamyl glutamate on E +AC PTM-0479 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. +CF C5H7NO3 +MM 129.042593 +DR PSI-MOD; MOD:01970 +DR RESID; AA0612 +TR Archaea; taxId:2267 (Thermoproteaceae) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Isopeptide bond + +// + ID 5-glutamyl glycerylphosphorylethanolamine on E +AC PTM-0403 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. +CF C5H12NO5P +MM 197.045309 +DR PSI-MOD; MOD:00179 +DR RESID; AA0170 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphoprotein + +// + ID 5-hydroxylysine on K +AC PTM-0044 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00037 +DR RESID; AA0028 +TR Eukaryota; taxId:33208 (Metazoa) +KW Hydroxylation + +// + ID Acetylation on K +MT Common Biological +TG K +PP Anywhere. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 +NL ETD:45.0204 +DI HCD:125.084063979 + +// + ID Acetylation on S +MT Less Common +TG S +PP Anywhere. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 + +// + ID Acetylation on T +MT Less Common +TG T +PP Anywhere. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 + +// + ID Acetylation on X +MT Common Biological +TG X +PP N-terminal. +CF C2H2O +MM 42.010564684 +DR Unimod; 1 + +// + ID ADP-ribosyl glutamic acid on E +AC PTM-0646 +MT UniProt +FT MOD_RES +TG E +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +TR Eukaryota; taxId:40674 (Mammalia) +KW ADP-ribosylation + +// + ID ADP-ribosylarginine on R +AC PTM-0053 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. 
+CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:00177 +DR RESID; AA0168 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW ADP-ribosylation + +// + ID ADP-ribosylcysteine on C +AC PTM-0055 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:00178 +DR RESID; AA0169 +TR Eukaryota; taxId:40674 (Mammalia) +KW ADP-ribosylation + +// + ID ADP-ribosylserine on S +AC PTM-0056 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:00242 +DR RESID; AA0237 +TR Eukaryota; taxId:9606 (Homo sapiens) +KW ADP-ribosylation + +// + ID Allysine on K +AC PTM-0059 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF H-3N-1O +MM -1.031634 +DR PSI-MOD; MOD:00130 +DR RESID; AA0121 +TR Eukaryota; taxId:6052 (Ephydatia muelleri), taxId:7742 (Vertebrata) + +// + ID Amidation on X +MT Less Common +TG X +PP Peptide C-terminal. +CF HNO-1 +MM -0.984015583 +DR Unimod; 2 + +// + ID Ammonia loss on C +MT Common Artifact +TG C +PP Peptide N-terminal. +CF H-3N-1 +MM -17.026549101 +DR Unimod; 385 + +// + ID Ammonia loss on N +MT Common Artifact +TG N +PP Anywhere. +CF H-3N-1 +MM -17.026549101 +DR Unimod; 385 + +// + ID Asymmetric dimethylarginine on R +AC PTM-0066 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00077 +DR RESID; AA0068 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Calcium on D +MT Metal +TG D +PP Anywhere. +CF H-2Ca +MM 37.946940799 +DR Unimod; 951 + +// + ID Calcium on E +MT Metal +TG E +PP Anywhere. +CF H-2Ca +MM 37.946940799 +DR Unimod; 951 + +// + ID Carbamidomethyl on D +MT Less Common +TG D +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on E +MT Less Common +TG E +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on H +MT Less Common +TG H +PP Anywhere. 
+CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on K +MT Less Common +TG K +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on S +MT Less Common +TG S +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on T +MT Less Common +TG T +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on X +MT Less Common +TG X +PP Peptide N-terminal. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamidomethyl on Y +MT Less Common +TG Y +PP Anywhere. +CF C2H3NO +MM 57.021463721 +DR Unimod; 4 + +// + ID Carbamyl on C +MT Common Artifact +TG C +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on K +MT Common Artifact +TG K +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on M +MT Common Artifact +TG M +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on R +MT Common Artifact +TG R +PP Anywhere. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carbamyl on X +MT Common Artifact +TG X +PP Peptide N-terminal. +CF CHNO +MM 43.005813656 +DR Unimod; 5 + +// + ID Carboxylation on D +MT Common Biological +TG D +PP Anywhere. +CF CO2 +MM 43.989829239 +DR Unimod; 299 + +// + ID Carboxylation on E +MT Common Biological +TG E +PP Anywhere. +CF CO2 +MM 43.989829239 +DR Unimod; 299 + +// + ID Carboxylation on K +MT Common Biological +TG K +PP Anywhere. +CF CO2 +MM 43.989829239 +DR Unimod; 299 + +// + ID Carboxymethylation on K +MT Less Common +TG K +PP Anywhere. +CF C2H2O2 +MM 58.005479304 +DR Unimod; 6 + +// + ID Carboxymethylation on W +MT Less Common +TG W +PP Anywhere. +CF C2H2O2 +MM 58.005479304 +DR Unimod; 6 + +// + ID Carboxymethylation on X +MT Less Common +TG X +PP Peptide N-terminal. +CF C2H2O2 +MM 58.005479304 +DR Unimod; 6 + +// + ID Citrullination on R +MT Common Biological +TG R +PP Anywhere. 
+CF H-1N-1O +MM 0.984015583 +DR Unimod; 7 +NL HCD:43.0058 +DI HCD:129.090223533 + +// + ID Citrulline on R +AC PTM-0092 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF H-1N-1O +MM 0.984016 +DR PSI-MOD; MOD:00219 +DR RESID; AA0214 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Citrullination + +// + ID Crotonylation on K +MT Common Biological +TG K +PP Anywhere. +CF C4H4O +MM 68.026214748 +DR Unimod; 1363 +DI HCD:151.099723533 + +// + ID Cu[I] on D +MT Metal +TG D +PP Anywhere. +CF H-1Cu +MM 61.921772688 +DR Unimod; 531 + +// + ID Cu[I] on E +MT Metal +TG E +PP Anywhere. +CF H-1Cu +MM 61.921772688 +DR Unimod; 531 + +// + ID Cysteine methyl ester on C +AC PTM-0105 +MT UniProt +FT MOD_RES +TG C +PP C-terminal. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00114 +DR RESID; AA0105 +TR Bacteria; taxId:201174 (Actinobacteria) +TR Eukaryota; taxId:4751 (Fungi), taxId:33208 (Metazoa) +KW Methylation + +// + ID Cysteine persulfide on C +AC PTM-0106 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF S +MM 31.972071 +DR PSI-MOD; MOD:00274 +DR RESID; AA0269 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID Cysteine sulfenic acid (-SOH) on C +AC PTM-0107 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00210 +DR RESID; AA0205 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW Oxidation + +// + ID Cysteine sulfinic acid (-SO2H) on C +AC PTM-0108 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF O2 +MM 31.989829 +DR PSI-MOD; MOD:00267 +DR RESID; AA0262 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW Oxidation + +// + ID Cysteine sulfonic acid (-SO3H) on C +AC PTM-0634 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. 
+CF O3 +MM 47.984744 +DR PSI-MOD; MOD:00460 +DR RESID; AA0556 +TR Eukaryota; taxId:40674 (Mammalia) +KW Oxidation + +// + ID Deamidated asparagine on N +AC PTM-0116 +MT UniProt +FT MOD_RES +TG N +PP Anywhere. +CF H-1N-1O +MM 0.984016 +DR PSI-MOD; MOD:00684 +DR RESID; AA0004 +TR Eukaryota; taxId:3702 (Arabidopsis thaliana), taxId:7742 (Vertebrata) + +// + ID Deamidated glutamine on Q +AC PTM-0117 +MT UniProt +FT MOD_RES +TG Q +PP Anywhere. +CF H-1N-1O +MM 0.984016 +DR PSI-MOD; MOD:00685 +DR RESID; AA0006 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:7742 (Vertebrata) + +// + ID Deamidation on N +MT Common Artifact +TG N +PP Anywhere. +CF H-1N-1O +MM 0.984015583 +DR Unimod; 7 + +// + ID Deamidation on Q +MT Common Artifact +TG Q +PP Anywhere. +CF H-1N-1O +MM 0.984015583 +DR Unimod; 7 + +// + ID Decarboxylation on D +MT Less Common +TG D +PP Anywhere. +CF C-1H-2O-1 +MM -30.010564684 +DR Unimod; 914 + +// + ID Decarboxylation on E +MT Less Common +TG E +PP Anywhere. +CF C-1H-2O-1 +MM -30.010564684 +DR Unimod; 914 + +// + ID Dehydroalanine on S +MT Less Common +TG S +PP Anywhere. +CF H-2O-1 +MM -18.010564684 + +// + ID Dehydrobutyrine on T +MT Less Common +TG T +PP Anywhere. +CF H-2O-1 +MM -18.010564684 + +// + ID Didehydro on Y +MT Less Common +TG Y +PP Anywhere. +CF H-2 +MM -2.015650064 +DR Unimod; 401 + +// + ID Dimethylated arginine on R +AC PTM-0341 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00783 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Dimethylation on N +MT Less Common +TG N +PP Anywhere. +CF C2H4 +MM 28.031300129 +DR Unimod; 36 + +// + ID Dimethylation on R +MT Common Biological +TG R +PP Anywhere. +CF C2H4 +MM 28.031300129 +DR Unimod; 36 +NL ETD:31.0422 or ETD:45.0579 + +// + ID Dioxidation on C +MT Less Common +TG C +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on F +MT Less Common +TG F +PP Anywhere. 
+CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on K +MT Less Common +TG K +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on M +MT Less Common +TG M +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on P +MT Less Common +TG P +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on R +MT Less Common +TG R +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on W +MT Less Common +TG W +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Dioxidation on Y +MT Less Common +TG Y +PP Anywhere. +CF O2 +MM 31.989829239 +DR Unimod; 425 + +// + ID Diphthamide on H +AC PTM-0118 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF C7H14N2O +MM 142.110613533 +DR PSI-MOD; MOD:00049 +DR RESID; AA0040 +TR Archaea; taxId:2157 (Archaea) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID Ethylation on D +MT Less Common +TG D +PP Anywhere. +CF C2H4 +MM 28.031300129 +DR Unimod; 280 + +// + ID Ethylation on X +MT Less Common +TG X +PP Peptide N-terminal. +CF C2H4 +MM 28.031300129 +DR Unimod; 280 + +// + ID Formylation on K +MT Common Biological +TG K +PP Anywhere. +CF CO +MM 27.99491462 +DR Unimod; 122 +DI HCD:111.068423533 + +// + ID Formylation on S +MT Less Common +TG S +PP Anywhere. +CF CO +MM 27.99491462 +DR Unimod; 122 + +// + ID Formylation on T +MT Less Common +TG T +PP Anywhere. +CF CO +MM 27.99491462 +DR Unimod; 122 + +// + ID Formylation on X +MT Less Common +TG X +PP Peptide N-terminal. +CF CO +MM 27.99491462 +DR Unimod; 122 + +// + ID Glycyl adenylate on G +AC PTM-0409 +MT UniProt +FT MOD_RES +TG G +PP C-terminal. +CF C10H12N5O6P +MM 329.05252 +DR PSI-MOD; MOD:01614 +DR RESID; AA0511 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Nucleotide-binding or Phosphoprotein + +// + ID Hydroxylation on K +MT Common Biological +TG K +PP Anywhere. 
+CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Hydroxylation on N +MT Common Biological +TG N +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Hydroxylation on P +MT Common Biological +TG P +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 +DI HCD:170.069143 + +// + ID Hydroxyproline on P +AC PTM-0149 +MT UniProt +FT MOD_RES +TG P +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00678 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID Hypusine on K +AC PTM-0150 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H9NO +MM 87.068414 +DR PSI-MOD; MOD:00125 +DR RESID; AA0116 +TR Archaea; taxId:2157 (Archaea) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hypusine + +// + ID Leucine methyl ester on L +AC PTM-0167 +MT UniProt +FT MOD_RES +TG L +PP C-terminal. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00304 +DR RESID; AA0299 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Magnesium on D +MT Metal +TG D +PP Anywhere. +CF H-2Mg +MM 21.969391633 +DR Unimod; 956 + +// + ID Magnesium on E +MT Metal +TG E +PP Anywhere. +CF H-2Mg +MM 21.969391633 +DR Unimod; 956 + +// + ID Methionine (R)-sulfoxide on M +AC PTM-0480 +MT UniProt +FT MOD_RES +TG M +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00720 +DR RESID; AA0581 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Oxidation + +// + ID Methionine sulfoxide on M +AC PTM-0469 +MT UniProt +FT MOD_RES +TG M +PP Anywhere. +CF O +MM 15.994915 +DR PSI-MOD; MOD:00719 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Oxidation + +// + ID Methylation on C +MT Less Common +TG C +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on D +MT Less Common +TG D +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on E +MT Less Common +TG E +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on H +MT Less Common +TG H +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on I +MT Less Common +TG I +PP Anywhere. 
+CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on K +MT Common Biological +TG K +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on L +MT Less Common +TG L +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on N +MT Less Common +TG N +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on Q +MT Less Common +TG Q +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on R +MT Common Biological +TG R +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on S +MT Less Common +TG S +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylation on T +MT Less Common +TG T +PP Anywhere. +CF CH2 +MM 14.015650064 +DR Unimod; 34 + +// + ID Methylhistidine on H +AC PTM-0176 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00661 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Eukaryota; taxId:5791 (Physarum polycephalum), taxId:7742 (Vertebrata) +KW Methylation + +// + ID N-acetylalanine on A +AC PTM-0199 +MT UniProt +FT MOD_RES +TG A +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00050 +DR RESID; AA0041 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylaspartate on D +AC PTM-0200 +MT UniProt +FT MOD_RES +TG D +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00051 +DR RESID; AA0042 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylcysteine on C +AC PTM-0201 +MT UniProt +FT MOD_RES +TG C +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00052 +DR RESID; AA0043 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Eukaryota; taxId:7742 (Vertebrata) +KW Acetylation + +// + ID N-acetylglutamate on E +AC PTM-0202 +MT UniProt +FT MOD_RES +TG E +PP N-terminal. 
+CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00053 +DR RESID; AA0044 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylglycine on G +AC PTM-0203 +MT UniProt +FT MOD_RES +TG G +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00055 +DR RESID; AA0046 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylmethionine on M +AC PTM-0205 +MT UniProt +FT MOD_RES +TG M +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00058 +DR RESID; AA0049 +TR Archaea; taxId:2287 (Sulfolobus solfataricus) +TR Bacteria; taxId:1270 (Micrococcus luteus) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylproline on P +AC PTM-0206 +MT UniProt +FT MOD_RES +TG P +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00059 +DR RESID; AA0050 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:33090 (Viridiplantae), taxId:40674 (Mammalia) +KW Acetylation + +// + ID N-acetylserine on S +AC PTM-0207 +MT UniProt +FT MOD_RES +TG S +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00060 +DR RESID; AA0051 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylthreonine on T +AC PTM-0208 +MT UniProt +FT MOD_RES +TG T +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00061 +DR RESID; AA0052 +TR Bacteria; taxId:90370 (Salmonella typhi) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N-acetylvaline on V +AC PTM-0210 +MT UniProt +FT MOD_RES +TG V +PP N-terminal. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00063 +DR RESID; AA0054 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:3055 (Chlamydomonas reinhardtii), taxId:33208 (Metazoa) +KW Acetylation + +// + ID N-methylproline on P +AC PTM-0219 +MT UniProt +FT MOD_RES +TG P +PP N-terminal. 
+CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00830 +DR RESID; AA0419 +TR Eukaryota; taxId:7227 (Drosophila melanogaster) +KW Methylation + +// + ID N,N-dimethylproline on P +AC PTM-0179 +MT UniProt +FT MOD_RES +TG P +PP N-terminal. +CF C2H4 +MM 28.031300533 +DR PSI-MOD; MOD:00075 +DR RESID; AA0066 +TR Eukaryota; taxId:6446 (Sipunculus nudus), taxId:7586 (Echinodermata), taxId:33682 (Euglenozoa) +KW Methylation + +// + ID N,N,N-trimethylalanine on A +AC PTM-0177 +MT UniProt +FT MOD_RES +TG A +PP N-terminal. +CF C3H6 +MM 42.046950533 +DR PSI-MOD; MOD:00071 +DR RESID; AA0062 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:5908 (Tetrahymena pyriformis), taxId:9986 (Oryctolagus cuniculus) +KW Methylation + +// + ID N,N,N-trimethylglycine on G +AC PTM-0485 +MT UniProt +FT MOD_RES +TG G +PP N-terminal. +CF C3H7 +MM 43.054775 +DR PSI-MOD; MOD:01982 +DR RESID; AA0619 +TR Eukaryota; taxId:40674 (Mammalia) +KW Methylation + +// + ID N5-methylglutamine on Q +AC PTM-0185 +MT UniProt +FT MOD_RES +TG Q +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00080 +DR RESID; AA0071 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:4932 (Saccharomyces cerevisiae) +KW Methylation + +// + ID N6-(2-hydroxyisobutyryl)lysine on K +AC PTM-0638 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H6O2 +MM 86.03678 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID N6-(ADP-ribosyl)lysine on K +AC PTM-0355 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C15H21N5O13P2 +MM 541.061109 +DR PSI-MOD; MOD:01399 +DR RESID; AA0476 +TR Eukaryota; taxId:10090 (Mus musculus) +KW ADP-ribosylation + +// + ID N6-(beta-hydroxybutyryl)lysine on K +AC PTM-0499 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H7O2 +MM 87.044604 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Hydroxylation + +// + ID N6-(pyridoxal phosphate)lysine on K +AC PTM-0387 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. 
+CF C8H8NO5P +MM 229.014009 +DR PSI-MOD; MOD:00128 +DR RESID; AA0119 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Pyridoxal phosphate + +// + ID N6-(retinylidene)lysine on K +AC PTM-0388 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C20H26 +MM 266.203451 +DR PSI-MOD; MOD:00129 +DR RESID; AA0120 +TR Archaea; taxId:28890 (Euryarchaeota), taxId:2236 (Halobacteriaceae) +TR Bacteria; taxId:1236 (Gammaproteobacteria) +TR Eukaryota; taxId:33154 (Opisthokonta) +KW Retinal protein + +// + ID N6-acetyllysine on K +AC PTM-0190 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00064 +DR RESID; AA0055 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID N6-biotinyllysine on K +AC PTM-0382 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C10H14N2O2S +MM 226.077599 +DR PSI-MOD; MOD:00126 +DR RESID; AA0117 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Biotin + +// + ID N6-butyryllysine on K +AC PTM-0637 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H6O +MM 70.041865 +DR PSI-MOD; MOD:01781 +DR RESID; AA0532 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-carboxylysine on K +AC PTM-0191 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF CO2 +MM 43.989829 +DR PSI-MOD; MOD:00123 +DR RESID; AA0114 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2763 (Rhodophyta), taxId:2830 (Haptophyceae), taxId:3027 (Cryptophyta), taxId:33090 (Viridiplantae), taxId:33634 (Stramenopiles), taxId:33682 (Euglenozoa), taxId:38254 (Glaucocystophyceae) + +// + ID N6-crotonyllysine on K +AC PTM-0475 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. 
+CF C4H4O +MM 68.026215 +DR PSI-MOD; MOD:01892 +DR RESID; AA0567 +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-glutaryllysine on K +AC PTM-0487 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C5H6O3 +MM 114.031694 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-lipoyllysine on K +AC PTM-0383 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C8H12OS2 +MM 188.032957 +DR PSI-MOD; MOD:00127 +DR RESID; AA0118 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Lipoyl + +// + ID N6-malonyllysine on K +AC PTM-0467 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C3H2O3 +MM 86.000394 +DR PSI-MOD; MOD:01893 +DR RESID; AA0568 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6-methyllysine on K +AC PTM-0194 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00085 +DR RESID; AA0076 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID N6-propionyllysine on K +AC PTM-0642 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C3H4O +MM 56.026215 +DR PSI-MOD; MOD:01398 +DR RESID; AA0475 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) + +// + ID N6-succinyllysine on K +AC PTM-0438 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C4H4O3 +MM 100.016044 +DR PSI-MOD; MOD:01819 +DR RESID; AA0545 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID N6,N6-dimethyllysine on K +AC PTM-0188 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. +CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00084 +DR RESID; AA0075 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID N6,N6,N6-trimethyllysine on K +AC PTM-0187 +MT UniProt +FT MOD_RES +TG K +PP Anywhere. 
+CF C3H6 +MM 42.046950533 +DR PSI-MOD; MOD:00083 +DR RESID; AA0074 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Methylation + +// + ID Nitrated tyrosine on Y +AC PTM-0213 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF H-1NO2 +MM 44.985078 +DR PSI-MOD; MOD:01352 +TR Eukaryota; taxId:40674 (Mammalia) +KW Nitration + +// + ID O-(2-cholinephosphoryl)serine on S +AC PTM-0400 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF C5H12NO3P +MM 165.055479533 +DR PSI-MOD; MOD:01588 +DR RESID; AA0498 +TR Bacteria; taxId:206351 (Neisseriales) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphoprotein + +// + ID O-(pantetheine 4'-phosphoryl)serine on S +AC PTM-0391 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF C11H21N2O6PS +MM 340.085794 +DR PSI-MOD; MOD:00159 +DR RESID; AA0150 +TR Bacteria; taxId:638 (Arsenophonus nasoniae), taxId:112 (Planctomycetales) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphopantetheine or Phosphoprotein + +// + ID O-acetylserine on S +AC PTM-0232 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF C2H2O +MM 42.010565 +DR PSI-MOD; MOD:00369 +DR RESID; AA0364 +TR Eukaryota; taxId:2759 (Eukaryota) +KW Acetylation + +// + ID Omega-N-methylarginine on R +AC PTM-0237 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00078 +DR RESID; AA0069 +TR Eukaryota; taxId:5661 (Leishmania donovani), taxId:40674 (Mammalia) +KW Methylation + +// + ID Oxidation on C +MT Less Common +TG C +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on D +MT Less Common +TG D +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on E +MT Less Common +TG E +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on F +MT Less Common +TG F +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on H +MT Less Common +TG H +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on I +MT Less Common +TG I +PP Anywhere. 
+CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on L +MT Less Common +TG L +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on Q +MT Less Common +TG Q +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on R +MT Less Common +TG R +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on S +MT Less Common +TG S +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on T +MT Less Common +TG T +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on V +MT Less Common +TG V +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on W +MT Less Common +TG W +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation on Y +MT Less Common +TG Y +PP Anywhere. +CF O +MM 15.99491462 +DR Unimod; 35 + +// + ID Oxidation to Kynurenine on W +MT Less Common +TG W +PP Anywhere. +CF C-1O +MM 3.99491462 +DR Unimod; 351 +DI HCD:194.06914219 + +// + ID Phosphoarginine on R +AC PTM-0250 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00227 +DR RESID; AA0222 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW Phosphoprotein + +// + ID Phosphohistidine on H +AC PTM-0252 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00890 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Phosphoprotein + +// + ID Phosphorylation on S +MT Common Biological +TG S +PP Anywhere. +CF HO3P +MM 79.966330889 +DR Unimod; 21 +NL HCD:0 or HCD:97.976895573 + +// + ID Phosphorylation on T +MT Common Biological +TG T +PP Anywhere. +CF HO3P +MM 79.966330889 +DR Unimod; 21 +NL HCD:0 or HCD:97.976895573 + +// + ID Phosphorylation on Y +MT Common Biological +TG Y +PP Anywhere. 
+CF HO3P +MM 79.966330889 +DR Unimod; 21 +NL HCD:0 or HCD:97.976895573 +DI HCD:215.034744803 + +// + ID Phosphoserine on S +AC PTM-0253 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00046 +DR RESID; AA0037 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein + +// + ID Phosphothreonine on T +AC PTM-0254 +MT UniProt +FT MOD_RES +TG T +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00047 +DR RESID; AA0038 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein + +// + ID Phosphotyrosine on Y +AC PTM-0255 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00048 +DR RESID; AA0039 +TR Archaea; taxId:2287 (Sulfolobus solfataricus) +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein + +// + ID Potassium on D +MT Metal +TG D +PP Anywhere. +CF H-1K +MM 37.955881454 +DR Unimod; 530 + +// + ID Potassium on E +MT Metal +TG E +PP Anywhere. +CF H-1K +MM 37.955881454 +DR Unimod; 530 + +// + ID Proline pyrrole to pyrrolidine six member ring on P +MT Less Common +TG P +PP Anywhere. +CF C +MM 12 + +// + ID Propionamidation on C +MT Less Common +TG C +PP Anywhere. +CF C3H5NO +MM 71.037113785 +DR Unimod; 24 + +// + ID Propionamidation on K +MT Less Common +TG K +PP Anywhere. +CF C3H5NO +MM 71.037113785 +DR Unimod; 24 + +// + ID Propionamidation on X +MT Less Common +TG X +PP Peptide N-terminal. +CF C3H5NO +MM 71.037113785 +DR Unimod; 24 + +// + ID Propionylation on K +MT Less Common +TG K +PP Anywhere. +CF C3H4O +MM 56.026214748 +DR Unimod; 58 +DI HCD:139.099823533 + +// + ID Pros-8alpha-FAD histidine on H +AC PTM-0258 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. 
+CF C27H31N9O15P2 +MM 783.141485 +DR PSI-MOD; MOD:00153 +DR RESID; AA0144 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW FAD + +// + ID Pros-methylhistidine on H +AC PTM-0259 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00082 +DR RESID; AA0073 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Eukaryota; taxId:7742 (Vertebrata) +KW Methylation + +// + ID Pyrrolidinone on P +MT Less Common +TG P +PP Anywhere. +CF C-1H-2O-1 +MM -30.010564684 +DR Unimod; 360 + +// + ID Pyrrolidone carboxylic acid on Q +AC PTM-0261 +MT UniProt +FT MOD_RES +TG Q +PP N-terminal. +CF H-3N-1 +MM -17.026549 +DR PSI-MOD; MOD:00040 +DR RESID; AA0031 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW Pyrrolidone carboxylic acid + +// + ID Reduction on D +MT Less Common +TG D +PP Anywhere. +CF O-1 +MM -15.99491462 +DR Unimod; 447 + +// + ID Reduction on S +MT Less Common +TG S +PP Anywhere. +CF O-1 +MM -15.99491462 +DR Unimod; 447 + +// + ID Reduction on T +MT Less Common +TG T +PP Anywhere. +CF O-1 +MM -15.99491462 +DR Unimod; 447 + +// + ID S-(dipyrrolylmethanemethyl)cysteine on C +AC PTM-0421 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C20H22N2O8 +MM 418.137616 +DR PSI-MOD; MOD:00257 +DR RESID; AA0252 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) + +// + ID S-8alpha-FAD cysteine on C +AC PTM-0272 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C27H31N9O15P2 +MM 783.141485 +DR PSI-MOD; MOD:00152 +DR RESID; AA0143 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW FAD + +// + ID S-cysteinyl cysteine on C +AC PTM-0415 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. 
+CF C3H5NO2S +MM 119.004099 +DR PSI-MOD; MOD:00765 +DR RESID; AA0025 +TR Bacteria; taxId:91347 (Enterobacterales) +TR Eukaryota; taxId:40674 (Mammalia) + +// + ID S-glutathionyl cysteine on C +AC PTM-0311 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF C10H15N3O6S +MM 305.068156 +DR PSI-MOD; MOD:00234 +DR RESID; AA0229 +TR Bacteria; taxId:83333 (Escherichia coli (strain K12)) +TR Eukaryota; taxId:3981 (Hevea brasiliensis), taxId:7742 (Vertebrata) +KW Glutathionylation + +// + ID S-methylcysteine on C +AC PTM-0279 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00239 +DR RESID; AA0234 +TR Archaea; taxId:28890 (Euryarchaeota) +TR Bacteria; taxId:1421 (Lysinibacillus sphaericus) +TR Eukaryota; taxId:3055 (Chlamydomonas reinhardtii) +KW Methylation + +// + ID S-nitrosocysteine on C +AC PTM-0280 +MT UniProt +FT MOD_RES +TG C +PP Anywhere. +CF H-1NO +MM 28.990164 +DR PSI-MOD; MOD:00235 +DR RESID; AA0230 +TR Bacteria; taxId:1224 (Proteobacteria) +TR Eukaryota; taxId:40674 (Mammalia) +KW S-nitrosylation + +// + ID Sodium on D +MT Metal +TG D +PP Anywhere. +CF H-1Na +MM 21.98194425 +DR Unimod; 30 + +// + ID Sodium on E +MT Metal +TG E +PP Anywhere. +CF H-1Na +MM 21.98194425 +DR Unimod; 30 + +// + ID Sulfonation on S +MT Less Common +TG S +PP Anywhere. +CF O3S +MM 79.956815033 +DR Unimod; 40 +NL AnyActivationType:79.956815033 + +// + ID Sulfonation on T +MT Less Common +TG T +PP Anywhere. +CF O3S +MM 79.956815033 +DR Unimod; 40 +NL AnyActivationType:79.956815033 + +// + ID Sulfonation on Y +MT Common Biological +TG Y +PP Anywhere. +CF O3S +MM 79.956815033 +DR Unimod; 40 +NL AnyActivationType:79.956815033 + +// + ID Sulfotyrosine on Y +AC PTM-0286 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF O3S +MM 79.956815 +DR PSI-MOD; MOD:00181 +DR RESID; AA0172 +TR Eukaryota; taxId:33208 (Metazoa), taxId:33090 (Viridiplantae) +KW Sulfation + +// + ID Symmetric dimethylarginine on R +AC PTM-0287 +MT UniProt +FT MOD_RES +TG R +PP Anywhere. 
+CF C2H4 +MM 28.0313 +DR PSI-MOD; MOD:00076 +DR RESID; AA0067 +TR Eukaryota; taxId:7742 (Vertebrata) +KW Methylation + +// + ID Tele-8alpha-FAD histidine on H +AC PTM-0288 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF C27H31N9O15P2 +MM 783.141485 +DR PSI-MOD; MOD:00226 +DR RESID; AA0221 +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +KW FAD + +// + ID Tele-methylhistidine on H +AC PTM-0290 +MT UniProt +FT MOD_RES +TG H +PP Anywhere. +CF CH2 +MM 14.01565 +DR PSI-MOD; MOD:00322 +DR RESID; AA0317 +TR Eukaryota; taxId:5791 (Physarum polycephalum), taxId:7742 (Vertebrata) +KW Methylation + +// + ID Thyroxine on Y +AC PTM-0294 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF C6I4O +MM 595.612805 +DR PSI-MOD; MOD:00187 +DR RESID; AA0178 +TR Eukaryota; taxId:40674 (Mammalia) +KW Iodination + +// + ID Triiodothyronine on Y +AC PTM-0295 +MT UniProt +FT MOD_RES +TG Y +PP Anywhere. +CF C6HI3O +MM 469.716158 +DR PSI-MOD; MOD:00186 +DR RESID; AA0177 +TR Eukaryota; taxId:40674 (Mammalia) +KW Iodination + +// + ID Trioxidation on C +MT Less Common +TG C +PP Anywhere. +CF O3 +MM 47.984743859 +DR Unimod; 345 + +// + ID Water loss on D +MT Less Common +TG D +PP Anywhere. +CF H-2O-1 +MM -18.010564684 +DR Unimod; 23 + +// + ID Water Loss on E +MT Common Artifact +TG E +PP Peptide N-terminal. +CF H-2O-1 +MM -18.010564684 +DR Unimod; 23 + +// + ID Zinc on D +MT Metal +TG D +PP Anywhere. +CF H-2Zn +MM 61.913491946 +DR Unimod; 954 + +// + ID Zinc on E +MT Metal +TG E +PP Anywhere. 
+CF H-2Zn +MM 61.913491946 +DR Unimod; 954 + +// + + P63260 + ACTG_MOUSE + + + Actin, cytoplasmic 2 + + + + Actg1 + Actg + + + Mus musculus + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIVTNWDDMEKIWHHTFYNELRVAPEEHPVLLTEAPLNPKANREKMTQIMFETFNTPAMYVAIQAVLSLYASGRTTGIVMDSGDGVTHTVPIYEGYALPHAILRLDLAGRDLTDYLMKILTERGYSFTTTAEREIVRDIKEKLCYVALDFEQEMATAASSSSLEKSYELPDGQVITIGNERFRCPEALFQPSFLGMESCGIHETTFNSIMKCDVDIRKDLYANTVLSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWISKQEYDESGPSIVHRKCF + + \ No newline at end of file diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs index 8925661cb..b6ce4087f 100644 --- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs +++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs @@ -110,9 +110,32 @@ public void LoadingIsReproducible(string fileName, DecoyType decoyType) // check are equivalent lists of proteins Assert.AreEqual(proteins1.Count, proteins2.Count); - // Because decoys are written in a parallel environment, there is no guarantee that the orders will be the same - CollectionAssert.AreEquivalent(proteins1.Select(p => p.Accession), proteins2.Select(p => p.Accession)); - CollectionAssert.AreEquivalent(proteins1.Select(p => p.BaseSequence), proteins2.Select(p => p.BaseSequence)); + // Because decoys are sorted before they are returned, the order should be identical + Assert.AreEqual(proteins1, proteins2); + } + + [Test] + [TestCase("proteinEntryLipidMoietyBindingRegion.xml", DecoyType.Reverse)] + public void LoadingLipidAsMod(string fileName, DecoyType decoyType) + { + var psiModDeserialized = 
Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml")); + Dictionary formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized); + List UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"), formalChargesDictionary).ToList(); + + // Load in proteins + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", fileName); + List proteins1 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, UniProtPtms, false, null, out var unknownModifications); + List proteins2 = ProteinDbLoader.LoadProteinXML(dbPath, true, decoyType, UniProtPtms, false, null, out unknownModifications); + + // check are equivalent lists of proteins + Assert.AreEqual(proteins1.Count, proteins2.Count); + // Because decoys are sorted before they are returned, the order should be identical + Assert.AreEqual(proteins1, proteins2); + var oneBasedPossibleLocalizedModifications = proteins1[0].OneBasedPossibleLocalizedModifications[36]; + var firstMod = oneBasedPossibleLocalizedModifications.First(); + Assert.AreEqual("LIPID", firstMod.FeatureType); + Assert.AreEqual("Anywhere.", firstMod.LocationRestriction); + Assert.AreEqual("S-palmitoyl cysteine on C", firstMod.IdWithMotif); } [Test] @@ -476,6 +499,118 @@ public void Modification_read_write_into_proteinDb() Assert.AreEqual(0, ProteinDbLoader.GetPtmListFromProteinXml(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "xml.xml")).Count); } + [Test] + public void MultiMod_ProteinDbWriter() + { + Loaders.LoadElements(); + var sampleModList = PtmListLoader + .ReadModsFromFile(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "z.txt"), + out var errors).ToList(); + var currentMod = sampleModList.First(); + // create slightly different modifications + var newMod = new Modification(_originalId: "1" + currentMod.OriginalId, _target: currentMod.Target, + _modificationType: 
currentMod.ModificationType, + _accession: currentMod.Accession, _locationRestriction: currentMod.LocationRestriction, + _featureType: currentMod.FeatureType, + _chemicalFormula: currentMod.ChemicalFormula); + var newMod2 = new Modification(_originalId: "2" + currentMod.OriginalId, _target: currentMod.Target, + _modificationType: currentMod.ModificationType, + _accession: currentMod.Accession, _locationRestriction: currentMod.LocationRestriction, + _featureType: currentMod.FeatureType, + _chemicalFormula: currentMod.ChemicalFormula); + var newMod3 = new Modification(_originalId: "3" + currentMod.OriginalId, _target: currentMod.Target, + _modificationType: currentMod.ModificationType, + _accession: currentMod.Accession, _locationRestriction: currentMod.LocationRestriction, + _featureType: currentMod.FeatureType, + _chemicalFormula: currentMod.ChemicalFormula); + var newMod4 = new Modification(_originalId: "4" + currentMod.OriginalId, _target: currentMod.Target, + _modificationType: currentMod.ModificationType, + _accession: currentMod.Accession, _locationRestriction: currentMod.LocationRestriction, + _featureType: currentMod.FeatureType, + _chemicalFormula: currentMod.ChemicalFormula); + var newMod5 = new Modification(_originalId: "5" + currentMod.OriginalId, _target: currentMod.Target, + _modificationType: currentMod.ModificationType, + _accession: currentMod.Accession, _locationRestriction: currentMod.LocationRestriction, + _featureType: currentMod.FeatureType, + _chemicalFormula: currentMod.ChemicalFormula); + sampleModList.AddRange(new List() { newMod, newMod2, newMod3, newMod4, newMod5 }); + Assert.AreEqual(6, sampleModList.OfType().Count()); + // Create a protein with all possible modifications + Protein protein = new Protein( + "MCMCMCSSSSSSSS", + "accession", + "organism", + new List>(), + new Dictionary> + { + { 2, sampleModList.OfType().ToList() }, + { 4, sampleModList.OfType().ToList() }, + { 6, sampleModList.OfType().ToList() }, + }, + null, + "name", + 
"full_name", + false, + false, + new List(), + new List(), + disulfideBonds: new List()); + + Assert.AreEqual(6, protein.OneBasedPossibleLocalizedModifications[2].OfType().Count()); + Assert.AreEqual(18, protein.OneBasedPossibleLocalizedModifications.SelectMany(kvp => kvp.Value).Count()); + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), + new List { protein }, + Path.Combine(TestContext.CurrentContext.TestDirectory, "test_modifications_with_proteins.xml")); + List newProteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "test_modifications_with_proteins.xml"), + true, DecoyType.None, new List(), false, new List(), + out Dictionary um); + + // Create a second protein with the same modifications, but listed in a different order. + sampleModList.Reverse(); + Protein modShuffledProtein = new Protein( + "MCMCMCSSSSSSSS", + "accession", + "organism", + new List>(), + new Dictionary> + { + { 2, sampleModList.OfType().ToList() }, + { 4, sampleModList.OfType().ToList() }, + { 6, sampleModList.OfType().ToList() }, + }, + null, + "name", + "full_name", + false, + false, + new List(), + new List(), + disulfideBonds: new List()); + string shuffledProteinFileName = Path.Combine(TestContext.CurrentContext.TestDirectory, + "test_shuffled_modifications_with_proteins.xml"); + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), + new List { modShuffledProtein }, shuffledProteinFileName); + List newShuffledProteins = ProteinDbLoader.LoadProteinXML(shuffledProteinFileName, + true, DecoyType.None, new List(), false, new List(), out um); + + // We've read in proteins from both databases. 
Assert that they are equal + Assert.AreEqual(newShuffledProteins.First().Accession, newProteins.First().Accession); + Assert.AreEqual(newShuffledProteins.First(), newProteins.First()); + + // Now, ensure that the modification dictionaries for each are equivalent (contain the same mods) and equal (contain the same mods in the same order) + for(int i = 1; i<4; i++) + { + int oneBasedResidue = i * 2; + + Assert.That(newShuffledProteins.First().OneBasedPossibleLocalizedModifications[oneBasedResidue], + Is.EquivalentTo(newProteins.First().OneBasedPossibleLocalizedModifications[oneBasedResidue])); + + Assert.That(newShuffledProteins.First().OneBasedPossibleLocalizedModifications[oneBasedResidue], + Is.EqualTo(newProteins.First().OneBasedPossibleLocalizedModifications[oneBasedResidue])); + } + } + [Test] public static void Test_MetaMorpheusStyleProteinDatabaseWriteAndREad() { diff --git a/mzLib/Test/DatabaseTests/proteinEntryLipidMoietyBindingRegion.xml b/mzLib/Test/DatabaseTests/proteinEntryLipidMoietyBindingRegion.xml new file mode 100644 index 000000000..44443a332 --- /dev/null +++ b/mzLib/Test/DatabaseTests/proteinEntryLipidMoietyBindingRegion.xml @@ -0,0 +1,1759 @@ + + + + P26678 + PPLA_HUMAN + + + Phospholamban + PLB + + + + PLN + PLB + + + Homo sapiens + Human + + + Eukaryota + Metazoa + Chordata + Craniata + Vertebrata + Euteleostomi + Mammalia + Eutheria + Euarchontoglires + Primates + Haplorrhini + Catarrhini + Hominidae + Homo + + + + + Structure of the rabbit phospholamban gene, cloning of the human cDNA, and assignment of the gene to human chromosome 6. + + + + + + + + + + + NUCLEOTIDE SEQUENCE [MRNA] + + + + Cloning of human cardiac phospholamban. + + + + + + NUCLEOTIDE SEQUENCE [MRNA] + + + + The human phospholamban gene: structure and expression. + + + + + + + + + + + + + NUCLEOTIDE SEQUENCE [GENOMIC DNA] + + + + The status, quality, and expansion of the NIH full-length cDNA project: the Mammalian Gene Collection (MGC). 
+ + + + + + + NUCLEOTIDE SEQUENCE [LARGE SCALE MRNA] + + Liver + + + + + Mutation of the phospholamban promoter associated with hypertrophic cardiomyopathy. + + + + + + + + + + + + + + INVOLVEMENT IN CMH18 + + + + Ca2+ -dependent interaction of S100A1 with the sarcoplasmic reticulum Ca2+ -ATPase2a and phospholamban in the human heart. + + + + + + + + + + + + + + INTERACTION WITH S100A1 + SUBCELLULAR LOCATION + + + + Myotonic dystrophy protein kinase phosphorylates phospholamban and regulates calcium uptake in cardiomyocyte sarcoplasmic reticulum. + + + + + + + + + + + + + + + + PHOSPHORYLATION AT SER-16 BY DMPK + SUBCELLULAR LOCATION + + + + Ca2+-calmodulin-dependent protein kinase expression and signalling in skeletal muscle during exercise. + + + + + + + + + PHOSPHORYLATION AT THR-17 BY CAMK2 + + + + Phospholamban interacts with HAX-1, a mitochondrial protein with anti-apoptotic function. + + + + + + + + + + + + INTERACTION WITH HAX1 + SUBCELLULAR LOCATION + TISSUE SPECIFICITY + + + + Lethal, hereditary mutants of phospholamban elude phosphorylation by protein kinase A. + + + + + + + + + + FUNCTION + CHARACTERIZATION OF VARIANTS CMD1P HIS-9; LEU-9; CYS-9 AND ARG-14 DEL + PHOSPHORYLATION AT SER-16 + MUTAGENESIS OF ARG-13; ARG-14; SER-16 AND THR-17 + + + + An enzyme assisted RP-RPLC approach for in-depth analysis of human liver phosphoproteome. + + + + + + + + + + + + + + + + IDENTIFICATION BY MASS SPECTROMETRY [LARGE SCALE ANALYSIS] + + Liver + + + + + The ER-Localized Transmembrane Protein EPG-3/VMP1 Regulates SERCA Activity to Control ER-Isolation Membrane Contacts for Autophagosome Formation. + + + + + + + + + + + + + + + + + + + FUNCTION + INTERACTION WITH ATP2A2 AND VMP1 + + + + Micropeptide hetero-oligomerization adds complexity to the calcium pump regulatory network. 
+ + + + + + + + + + + + SUBUNIT + INTERACTION WITH ATP2A2 + + + + Solution structure of the cytoplasmic domain of phospholamban: phosphorylation leads to a local perturbation in secondary structure. + + + + + + + + + + + + STRUCTURE BY NMR OF 1-25 + + + + Computational searching and mutagenesis suggest a structure for the pentameric transmembrane domain of phospholamban. + + + + + + + + + + 3D-STRUCTURE MODELING + + + + Using experimental information to produce a model of the transmembrane domain of the ion channel phospholamban. + + + + + + + + 3D-STRUCTURE MODELING + + + + The structure of phospholamban pentamer reveals a channel-like architecture in membranes. + + + + + + + + STRUCTURE BY NMR + SUBUNIT + + + + Structure determination of symmetric homo-oligomers by a complete search of symmetry configuration space, using NMR restraints and van der Waals packing. + + + + + + + + + + + STRUCTURE BY NMR + SUBUNIT + + + + Dilated cardiomyopathy and heart failure caused by a mutation in phospholamban. + + + + + + + + + + + + + + + + VARIANT CMD1P CYS-9 + CHARACTERIZATION OF VARIANT CMD1P CYS-9 + + + + A mutation in the human phospholamban gene, deleting arginine 14, results in lethal, hereditary cardiomyopathy. + + + + + + + + + + + + + + + + + + + VARIANT CMD1P ARG-14 DEL + CHARACTERIZATION OF VARIANT CMD1P ARG-14 DEL + + + + Mutations in the human phospholamban gene in patients with heart failure. + + + + + + + + + + + + + + + VARIANTS CMD1P HIS-9 AND LEU-9 + + + + Hydrophobic imbalance in the cytoplasmic domain of phospholamban is a determinant for lethal dilated cardiomyopathy. + + + + + + + + + CHARACTERIZATION OF VARIANTS CMD1P CYS-9 AND ARG-14 DEL + FUNCTION + SUBCELLULAR LOCATION + + + Reversibly inhibits the activity of ATP2A2/SERCA2 in cardiac sarcoplasmic reticulum by decreasing the apparent affinity of the ATPase for Ca(2+) (PubMed:28890335). 
Binds preferentially to the ATP-bound E1 conformational form of ATP2A2 which predominates at low Ca(2+) concentrations during the diastolic phase of the cardiac cycle (By similarity). Inhibits ATP2A2 Ca(2+) affinity by disrupting its allosteric activation by ATP (By similarity). Modulates the contractility of the heart muscle in response to physiological stimuli via its effects on ATP2A2. Modulates calcium re-uptake during muscle relaxation and plays an important role in calcium homeostasis in the heart muscle. The degree of ATP2A2 inhibition depends on the oligomeric state of PLN. ATP2A2 inhibition is alleviated by PLN phosphorylation (By similarity). Also inhibits the activity of ATP2A3/SERCA3 (By similarity). Controls intracellular Ca(2+) levels in elongated spermatids and may play a role in germ cell differentiation (By similarity). In the thalamic reticular nucleus of the brain, plays a role in the regulation of sleep patterns and executive functioning (By similarity). + + + Homopentamer (PubMed:16043693, PubMed:16897780). Can also form heterooligomers with other sarcoplasmic/endoplasmic reticulum calcium ATPase (SERCA) regulators ALN, SMIM6/ELN, SLN and STRIT1/DWORF (PubMed:36523160). Monomer (By similarity). Interacts with HAX1 (PubMed:17241641). Interacts as a monomer with ATP2A2; the interaction decreases ATP2A2 Ca(2+) affinity (PubMed:28890335, PubMed:36523160). Interacts with VMP1; VMP1 competes with PLN and SLN to prevent them from forming an inhibitory complex with ATP2A2 (PubMed:28890335). Interacts with S100A1 in a Ca(2+)-dependent manner (PubMed:12804600). 
+ + + + P26678 + + + Q3SXY8 + + + false + 3 + + + + P26678 + + + P07307-3 + + + false + 3 + + + + P26678 + + + O15342 + + + false + 3 + + + + P26678 + + + Q9BXK5 + + + false + 8 + + + + P26678 + + + Q13323 + + + false + 3 + + + + P26678 + + + P19397 + + + false + 3 + + + + P26678 + + + O95471 + + + false + 3 + + + + P26678 + + + Q9UHP7-3 + + + false + 3 + + + + P26678 + + + Q7Z7G2 + + + false + 3 + + + + P26678 + + + O43889-2 + + + false + 3 + + + + P26678 + + + Q96BA8 + + + false + 3 + + + + P26678 + + + Q09013 + + + false + 4 + + + + P26678 + + + Q92838 + + + false + 11 + + + + P26678 + + + Q9GZR5 + + + false + 3 + + + + P26678 + + + Q5JX71 + + + false + 3 + + + + P26678 + + + Q14318 + + + false + 3 + + + + P26678 + + + Q8TBE3 + + + false + 3 + + + + P26678 + + + P48165 + + + false + 3 + + + + P26678 + + + Q8TDT2 + + + false + 3 + + + + P26678 + + + O60883 + + + false + 3 + + + + P26678 + + + Q8TED1 + + + false + 3 + + + + P26678 + + + P31937 + + + false + 3 + + + + P26678 + + + Q7Z5P4 + + + false + 3 + + + + P26678 + + + P43628 + + + false + 3 + + + + P26678 + + + Q5T700 + + + false + 4 + + + + P26678 + + + Q8N112 + + + false + 3 + + + + P26678 + + + Q9GZY8-5 + + + false + 3 + + + + P26678 + + + Q6N075 + + + false + 3 + + + + P26678 + + + Q99735 + + + false + 3 + + + + P26678 + + + O14880 + + + false + 3 + + + + P26678 + + + Q9GZW8 + + + false + 3 + + + + P26678 + + + Q9H2K0 + + + false + 3 + + + + P26678 + + + P15941-11 + + + false + 3 + + + + P26678 + + + Q8TBJ4 + + + false + 3 + + + + P26678 + + + O95197-3 + + + false + 3 + + + + P26678 + + + Q9NR31 + + + false + 3 + + + + P26678 + + + A0A0S2Z4U3 + + + false + 3 + + + + P26678 + + + Q9Y3P8 + + + false + 3 + + + + P26678 + + + Q15849 + + + false + 3 + + + + P26678 + + + Q8IWU4 + + + false + 3 + + + + P26678 + + + O95436-2 + + + false + 3 + + + + P26678 + + + Q9NQQ7-3 + + + false + 3 + + + + P26678 + + + Q9NP94 + + + false + 3 + + + + P26678 + + + Q9HBV2 + + + false + 3 + + + + P26678 + + + Q9NPE6 + + + false + 
3 + + + + P26678 + + + Q16623 + + + false + 3 + + + + P26678 + + + P32856-2 + + + false + 3 + + + + P26678 + + + Q9BVX2 + + + false + 3 + + + + P26678 + + + Q7Z7N9 + + + false + 3 + + + + P26678 + + + Q6UW68 + + + false + 3 + + + + P26678 + + + Q9NWC5 + + + false + 3 + + + + P26678 + + + Q96B21 + + + false + 3 + + + + P26678 + + + Q4KMG9 + + + false + 3 + + + + P26678 + + + Q8N661 + + + false + 3 + + + + P26678 + + + O15393-2 + + + false + 3 + + + + Endoplasmic reticulum membrane + Single-pass membrane protein + + + Sarcoplasmic reticulum membrane + Single-pass membrane protein + + + Mitochondrion membrane + Single-pass membrane protein + + + Membrane + Single-pass membrane protein + + Colocalizes with HAX1 at the endoplasmic reticulum (PubMed:17241641). Colocalizes with DMPK at the sarcoplasmic reticulum (PubMed:15598648). + + + Heart muscle (at protein level). + + + Phosphorylation by PKA abolishes the inhibition of ATP2A2-mediated calcium uptake. Phosphorylated at Thr-17 by CaMK2, and in response to beta-adrenergic stimulation. Phosphorylation by DMPK may stimulate sarcoplasmic reticulum calcium uptake in cardiomyocytes. + + + Palmitoylated by ZDHHC16, promoting formation of the homopentamer. + + + In elongated spermatids, proteolytically cleaved by SPPL2C which modulates intracellular Ca(2+) homeostasis. + + + + Cardiomyopathy, dilated, 1P + CMD1P + A disorder characterized by ventricular dilation and impaired systolic function, resulting in congestive heart failure and arrhythmia. Patients are at risk of premature death. + + + The disease is caused by variants affecting the gene represented in this entry. + + + + Cardiomyopathy, familial hypertrophic, 18 + CMH18 + A hereditary heart disorder characterized by ventricular hypertrophy, which is usually asymmetric and often involves the interventricular septum. The symptoms include dyspnea, syncope, collapse, palpitations, and chest pain. They can be readily provoked by exercise. 
The disorder has inter- and intrafamilial variability ranging from benign to malignant forms with high risk of cardiac failure and sudden cardiac death. + + + The disease is caused by variants affecting the gene represented in this entry. + + + For practical reasons, PLN activity is most often studied with ATP2A1 instead of ATP2A2. + + + Belongs to the phospholamban family. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 3D-structure + Acetylation + Cardiomyopathy + Disease variant + Endoplasmic reticulum + Lipoprotein + Membrane + Mitochondrion + Palmitate + Phosphoprotein + Proteomics identification + Reference proteome + Sarcoplasmic reticulum + Transmembrane + Transmembrane helix + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + R + C + + + + + + R + H + + + + + + R + L + + + + + + + + + + + R + A + + + + + + R + A + + + + + + S + A + + + + + + T + A + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
MEKVQYLTRSAIRRASTIEMPQQARQKLQNLFINFCLILICLLLICIIVMLL + + +Copyrighted by the UniProt Consortium, see https://www.uniprot.org/terms Distributed under the Creative Commons Attribution (CC BY 4.0) License + + \ No newline at end of file diff --git a/mzLib/Test/DigestionAgentTests.cs b/mzLib/Test/DigestionAgentTests.cs new file mode 100644 index 000000000..c1cf05705 --- /dev/null +++ b/mzLib/Test/DigestionAgentTests.cs @@ -0,0 +1,13 @@ +using Omics.Digestion; +using Omics.Modifications; +using System.Collections.Generic; +using NUnit.Framework; + +namespace Test +{ + public class DigestionAgentTests + { + + + } +} diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/protein.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/protein.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/psm.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/psm.tsv new file mode 100644 index 000000000..3b205c248 --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_1/psm.tsv @@ -0,0 +1,5 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00906.00906.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml VKEDPDGEHAR SISGRPIK.VKEDPDGEHAR.RAMQKVMA K R 11 3 2111.248 1251.5845 
1251.5914 418.2021 418.2044 1251.5842 418.202 0.0072 0.05469976 15.518 11.386 0.8908 2 1 144 154 208463.97 0 FALSE sp|P52272|HNRPM_HUMAN P52272 HNRPM_HUMAN HNRNPM Heterogeneous nuclear ribonucleoprotein M "tr|M0QYQ7|M0QYQ7_HUMAN, tr|M0R019|M0R019_HUMAN, tr|M0R0N3|M0R0N3_HUMAN, tr|M0R2T0|M0R2T0_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00917.00917.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml NEEDEGHSNSSPR GAKIDASK.NEEDEGHSNSSPR.HSEAATAQ K H 13 3 2113.596 1456.5808 1456.5822 486.5342 486.5347 1456.5814 486.5344 0.0007 0.007893147 17.911 0 1 2 0 73 85 349264.44 0 FALSE sp|Q14103|HNRPD_HUMAN Q14103 HNRPD_HUMAN HNRNPD Heterogeneous nuclear ribonucleoprotein D0 "tr|A0A994J4B1|A0A994J4B1_HUMAN, tr|A0A994J4R1|A0A994J4R1_HUMAN, tr|D6RAF8|D6RAF8_HUMAN, tr|D6RD83|D6RD83_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.00947.00947.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml VGQADDSTKPTNK IGSFSGIR.VGQADDSTKPTNK.ASSTSITS R A 13 3 2120.3625 1359.6602 1359.6622 454.2273 454.228 1359.663 454.2283 -0.0007 0.001409289 12.904 0 0.9994 2 1 1339 1351 171548.62 0 FALSE sp|P35658|NU214_HUMAN P35658 NU214_HUMAN NUP214 Nuclear pore complex protein Nup214 "tr|A0A494C1F2|A0A494C1F2_HUMAN, tr|A0A8Q3SHZ4|A0A8Q3SHZ4_HUMAN, tr|B7ZAV2|B7ZAV2_HUMAN, tr|E9PKD2|E9PKD2_HUMAN, tr|H0Y837|H0Y837_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.01021.01021.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_1\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.pep.xml AEQEAEEPRK IAERARIK.AEQEAEEPRK.THSEEFTN K T 10 3 2136.586 1185.5615 1185.5641 396.1944 396.1953 1185.5625 396.1948 0.0015 0.151182 10.782 0 0.9548 2 1 106 115 125972.164 0 FALSE sp|Q9H788|SH24A_HUMAN Q9H788 SH24A_HUMAN SH2D4A SH2 domain-containing protein 4A 
tr|H0YAT1|H0YAT1_HUMAN diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/protein.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/protein.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/psm.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/psm.tsv new file mode 100644 index 000000000..f010cee8d --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/A_2/psm.tsv @@ -0,0 +1,5 @@ +Spectrum Spectrum File Peptide Modified Peptide Extended Peptide Prev AA Next AA Peptide Length Charge Retention Observed Mass Calibrated Observed Mass Observed M/Z Calibrated Observed M/Z Calculated Peptide Mass Calculated M/Z Delta Mass Expectation Hyperscore Nextscore Probability Number of Enzymatic Termini Number of Missed Cleavages Protein Start Protein End Intensity Assigned Modifications Observed Modifications Purity Is Unique Protein Protein ID Entry Name Gene Protein Description Mapped Genes Mapped Proteins +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01005.01005.2 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml HAVSEGTK IIPGEIAK.HAVSEGTK.AVTKYTSA K A 8 2 1938.0153 827.4154 827.4152 414.715 414.7149 827.4137 414.7141 0.0015 1.57772E-05 21.993 14.41 0.9994 2 0 110 117 8907404 0 FALSE sp|O60814|H2B1K_HUMAN O60814 H2B1K_HUMAN H2BC12 Histone H2B type 1-K "H2BC1, H2BC11, H2BC12L, H2BC13, H2BC14, H2BC15, H2BC17, H2BC18, H2BC21, H2BC26, H2BC3, H2BC4, H2BC5, H2BC9, H2BK1" "sp|A0A2R8Y619|H2BK1_HUMAN, sp|P06899|H2B1J_HUMAN, sp|P23527|H2B1O_HUMAN, sp|P33778|H2B1B_HUMAN, sp|P57053|H2BFS_HUMAN, sp|P58876|H2B1D_HUMAN, sp|P62807|H2B1C_HUMAN, sp|Q16778|H2B2E_HUMAN, sp|Q5QNW6|H2B2F_HUMAN, sp|Q8N257|H2B3B_HUMAN, sp|Q93079|H2B1H_HUMAN, sp|Q96A08|H2B1A_HUMAN, sp|Q99877|H2B1N_HUMAN, 
sp|Q99879|H2B1M_HUMAN, sp|Q99880|H2B1L_HUMAN, tr|U3KQK0|U3KQK0_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01551.01551.2 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml YDSTHGR DYAAYMFK.YDSTHGR.YAGEVSHD K Y 7 2 1976.3535 834.3639 834.3626 418.1892 418.1886 834.362 418.1883 0.0005 0.01685582 15.593 0 0.9997 2 0 47 53 1.96E+07 0 FALSE sp|P00359|G3P3_YEAST P00359 G3P3_YEAST TDH3 Glyceraldehyde-3-phosphate dehydrogenase 3 "GAPDHS, TDH1, TDH2" "sp|O14556|G3PT_HUMAN, sp|P00358|G3P2_YEAST, sp|P00360|G3P1_YEAST, tr|K7EP73|K7EP73_HUMAN" +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01565.01565.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml AESSQTCHSEQGDK AESSQTCHSEQGDK KSTQNSFR.AESSQTCHSEQGDK.KMEEKNSG R K 14 3 1977.164 1562.6292 1562.6265 521.8837 521.8828 1562.6267 521.8828 -0.0002 8.61986E-05 27.758 10.626 1 2 0 600 613 4579535.5 7C(57.0215) 0 TRUE sp|P46063|RECQ1_HUMAN P46063 RECQ1_HUMAN RECQL ATP-dependent DNA helicase Q1 +Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.01607.01607.3 E:\MadeleineH\Kelly_TwoProteomeMSFragger\A_2\interact-Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.pep.xml VTSTGRPGHASR ERSPWWVR.VTSTGRPGHASR.FMEDTAAE R F 12 3 1979.2815 1224.6323 1224.6318 409.218 409.2179 1224.6322 409.218 -0.0003 0.9328038 10.648 0 0.9687 2 1 198 209 1587950.4 0 FALSE sp|Q03154|ACY1_HUMAN Q03154 ACY1_HUMAN ACY1 Aminoacylase-1 ABHD14A-ACY1 "tr|A0A1B0GU86|A0A1B0GU86_HUMAN, tr|A0A1B0GV31|A0A1B0GV31_HUMAN, tr|A0A1B0GVA5|A0A1B0GVA5_HUMAN, tr|A0A1B0GW23|A0A1B0GW23_HUMAN, tr|C9JMV9|C9JMV9_HUMAN, tr|C9JYZ0|C9JYZ0_HUMAN" diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/combined_ion.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/combined_ion.tsv new file mode 100644 index 000000000..e69de29bb diff 
--git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/copy_experiment_annotation.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/copy_experiment_annotation.tsv new file mode 100644 index 000000000..e69de29bb diff --git a/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/experiment_annotation.tsv b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/experiment_annotation.tsv new file mode 100644 index 000000000..3421dff60 --- /dev/null +++ b/mzLib/Test/FileReadingTests/ExternalFileTypes/EditedMSFraggerResults/experiment_annotation.tsv @@ -0,0 +1,3 @@ +file sample sample_name condition replicate +E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.raw A_1 A_1 A 1 +E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.raw A_2 A_2 A 2 diff --git a/mzLib/Test/FileReadingTests/TestBruker.cs b/mzLib/Test/FileReadingTests/TestBruker.cs index a7c450d8a..2d7d20710 100644 --- a/mzLib/Test/FileReadingTests/TestBruker.cs +++ b/mzLib/Test/FileReadingTests/TestBruker.cs @@ -1,4 +1,5 @@ -using System.IO; +using System.Diagnostics; +using System.IO; using MassSpectrometry; using NUnit; using NUnit.Framework; @@ -28,9 +29,8 @@ public void TestConstructors() public void TestFileDoesntExist() { string fakePath = "fakePath.d"; - var reader = MsDataFileReader.GetDataFile(fakePath); Assert.Throws(() => - reader.InitiateDynamicConnection()); + MsDataFileReader.GetDataFile(fakePath)); } [Test] @@ -140,15 +140,5 @@ public void TestPeakFiltering() var scan = MsDataFileReader.GetDataFile(_centroidPath).LoadAllStaticData(filteringParams).Scans[0]; Assert.That(scan.MassSpectrum.XArray.Length == 1); } - - [Test] - public void TestFileNotFoundExceptionThrown() - { - MsDataFile brukerReader = MsDataFileReader.GetDataFile("notrealfile.d"); - Assert.Throws(delegate - { - brukerReader.LoadAllStaticData(); - }); - } } } diff --git 
a/mzLib/Test/FileReadingTests/TestMsDataFileToResultsAdapter.cs b/mzLib/Test/FileReadingTests/TestMsDataFileToResultsAdapter.cs index 85b9425c3..9328ded99 100644 --- a/mzLib/Test/FileReadingTests/TestMsDataFileToResultsAdapter.cs +++ b/mzLib/Test/FileReadingTests/TestMsDataFileToResultsAdapter.cs @@ -3,6 +3,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.IO; +using System.Linq; using MassSpectrometry; using MzLibUtil; using NUnit.Framework; @@ -135,7 +136,8 @@ public void TestWriting(string filePath, string outfile, int loop) Assert.That(readInScan.OneBasedScanNumber.Equals(writtenScan.OneBasedScanNumber)); Assert.That(readInScan.MsnOrder.Equals(writtenScan.MsnOrder)); Assert.That(readInScan.IsCentroid.Equals(writtenScan.IsCentroid)); - Assert.That(readInScan.MassSpectrum.Equals(writtenScan.MassSpectrum)); + Assert.That(readInScan.MassSpectrum.YArray, Is.EquivalentTo(writtenScan.MassSpectrum.YArray)); + Assert.That(readInScan.MassSpectrum.XArray, Is.EquivalentTo(writtenScan.MassSpectrum.XArray)); } File.Delete(outfile); diff --git a/mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs b/mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs new file mode 100644 index 000000000..731284adb --- /dev/null +++ b/mzLib/Test/FileReadingTests/TestMsFraggerCombinedResults.cs @@ -0,0 +1,106 @@ +using NUnit.Framework; +using Readers; +using System.Collections.Generic; +using System.Linq; +using System.IO; +using TopDownProteomics; +using System.Diagnostics.CodeAnalysis; + +namespace Test.FileReadingTests +{ + [ExcludeFromCodeCoverage] + internal class TestMsFraggerCombinedResults + { + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + public void TestLoadResultsCount(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + 
NUnit.Framework.Assert.That(ms.AllPsmFiles.Count.Equals(2)); + NUnit.Framework.Assert.That(ms.Results.Count.Equals(8)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + public void TestLoadResults(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + List results = ms.Results.Select(psm => psm.FileName).ToList(); + + NUnit.Framework.Assert.That(results.Count(s => s.Contains("A_1")).Equals(4)); + NUnit.Framework.Assert.That(results.Count(s => s.Contains("A_2")).Equals(4)); + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + public void TestFileNameToFilePathWithParameter(string path) + { + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + List fullFilePath = new List(); + // these local files are not actually accessed, they are fillers to test the method + string fullFilePath1 = @"E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_11860_1x02nguL_8.raw"; + string fullFilePath2 = @"E:\MadeleineH\Raw_Files\Ex_AuLC1_30m_2D19_3_20um30cm_SPE50_15118120_OTOT_2215_HeYe_1.raw"; + fullFilePath.Add(fullFilePath1); + fullFilePath.Add(fullFilePath2); + + List results = ms.Results.Select(psm => psm.FileName).ToList(); + Dictionary allFiles = ms.FileNameToFilePath(fullFilePath); + List filePaths = ms.ExperimentAnnotations.Select(psm => psm.File).ToList(); + + foreach (var fileName in results) + { + NUnit.Framework.Assert.That(allFiles.TryGetValue(fileName, out var output)); + NUnit.Framework.Assert.That(filePaths.Contains(output)); + } + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults")] + public void TestFileNameToFilePathWithoutParameter(string path) + { + string filePath = 
Path.Combine(TestContext.CurrentContext.TestDirectory, path); + MsFraggerCombinedResults ms = new MsFraggerCombinedResults(filePath); + ms.LoadResults(); + + List results = ms.Results.Select(psm => psm.FileName).ToList(); + Dictionary allFiles = ms.FileNameToFilePath(); + List filePaths = ms.ExperimentAnnotations.Select(psm => psm.File).ToList(); + + foreach (var fileName in results) + { + NUnit.Framework.Assert.That(allFiles.TryGetValue(fileName, out var output)); + NUnit.Framework.Assert.That(filePaths.Contains(output)); + } + } + + [Test] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults\experiment_annotation.tsv")] + public void TestExperimentAnnotationFile(string path) + { + string fileToWrite = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults\copy_experiment_annotation.tsv"); + if (File.Exists(fileToWrite)) + { + File.Delete(fileToWrite); + } + + string fileToRead = Path.Combine(TestContext.CurrentContext.TestDirectory, path); + + ExperimentAnnotationFile experimentAnnotation = FileReader.ReadFile(fileToRead); + + experimentAnnotation.WriteResults(fileToWrite); + NUnit.Framework.Assert.That(File.Exists(fileToWrite)); + + File.Delete(fileToWrite); + } + } +} \ No newline at end of file diff --git a/mzLib/Test/FileReadingTests/TestMzML.cs b/mzLib/Test/FileReadingTests/TestMzML.cs index 826efca77..63107e471 100644 --- a/mzLib/Test/FileReadingTests/TestMzML.cs +++ b/mzLib/Test/FileReadingTests/TestMzML.cs @@ -741,7 +741,7 @@ public void WriteMzmlTest() Assert.AreEqual(2, reader.GetClosestOneBasedSpectrumNumber(2)); var newFirstValue = reader.GetOneBasedScan(1).MassSpectrum.FirstX; - Assert.AreEqual(oldFirstValue.Value, newFirstValue.Value, 1e-9); + Assert.AreEqual(oldFirstValue.Value, newFirstValue.Value); var secondScan2 = reader.GetOneBasedScan(2); diff --git a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs index 
2018158b1..7719d4816 100644 --- a/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs +++ b/mzLib/Test/FileReadingTests/TestPsmFromTsv.cs @@ -8,7 +8,6 @@ using System.Text.RegularExpressions; using Omics.Fragmentation; using Omics.SpectrumMatch; -using Proteomics; using Readers; namespace Test.FileReadingTests @@ -76,7 +75,6 @@ public static void ReadOGlycoPsmsLocalizedGlycans() } Assert.AreEqual(1, localGlycans.Count); - } [Test] @@ -188,7 +186,7 @@ public static void TestParseModification() // psm with two mods on the same amino acid string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK"; - modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq); + modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq, true, true); Assert.That(modDict.Count == 1); Assert.That(modDict.ContainsKey(0)); Assert.That(modDict[0].Count == 2); diff --git a/mzLib/Test/FileReadingTests/TestRawFileReader.cs b/mzLib/Test/FileReadingTests/TestRawFileReader.cs index 0b67c1508..7ebfae6ff 100644 --- a/mzLib/Test/FileReadingTests/TestRawFileReader.cs +++ b/mzLib/Test/FileReadingTests/TestRawFileReader.cs @@ -171,9 +171,10 @@ public static void TestPeakFilteringRawFileReader(string infile) for (int j = 0; j < mzmlScan.MassSpectrum.XArray.Length; j++) { - double roundedMzmlMz = Math.Round(mzmlScan.MassSpectrum.XArray[j], 2); - double roundedRawMz = Math.Round(rawScan.MassSpectrum.XArray[j], 2); + double roundedRawMz = Math.Round(rawScan.MassSpectrum.XArray[j], 4); + double roundedMzmlMz = Math.Round(mzmlScan.MassSpectrum.XArray[j], 4); + // XArray is rounded to the 4th digit during CreateAndWrite Assert.AreEqual(roundedMzmlMz, roundedRawMz); double roundedMzmlIntensity = Math.Round(mzmlScan.MassSpectrum.XArray[j], 0); diff --git a/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs b/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs index ef7a238b0..868a6dc3a 100644 --- 
a/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs +++ b/mzLib/Test/FileReadingTests/TestSupportedFileExtensions.cs @@ -16,7 +16,8 @@ internal class TestSupportedFileExtensions [TestCase("DataFiles/sliced_ethcd.raw", SupportedFileType.ThermoRaw)] [TestCase("DataFiles/SmallCalibratibleYeast.mzml", SupportedFileType.MzML)] [TestCase("DataFiles/tester.mgf", SupportedFileType.Mgf)] - [TestCase("DataFiles/tester.d", SupportedFileType.BrukerD)] + [TestCase("DataFiles/centroid_1x_MS1_4x_autoMS2.d", SupportedFileType.BrukerD)] + [TestCase("DataFiles/timsTOF_snippet.d", SupportedFileType.BrukerTimsTof)] [TestCase(@"FileReadingTests\ExternalFileTypes\Ms2Feature_FlashDeconvjurkat_td_rep1_fract2_ms2.feature", SupportedFileType.Ms2Feature)] [TestCase(@"FileReadingTests\ExternalFileTypes\TopFDMs1Feature_jurkat_td_rep1_fract2_ms1.feature", SupportedFileType.Ms1Feature)] [TestCase(@"FileReadingTests\ExternalFileTypes\TopFDmzrt_jurkat_td_rep1_fract2_frac.mzrt.csv", SupportedFileType.Mzrt_TopFd)] @@ -37,6 +38,7 @@ internal class TestSupportedFileExtensions [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_DecoyResults_IcDecoy.tsv", SupportedFileType.MsPathFinderTDecoys)] [TestCase(@"FileReadingTests\ExternalFileTypes\MsPathFinderT_AllResults_IcTda.tsv", SupportedFileType.MsPathFinderTAllResults)] [TestCase(@"FileReadingTests\ExternalFileTypes\crux.txt", SupportedFileType.CruxResult)] + [TestCase(@"FileReadingTests\ExternalFileTypes\EditedMSFraggerResults\experiment_annotation.tsv", SupportedFileType.ExperimentAnnotation)] public static void TestSupportedFileTypeExtensions(string filePath, SupportedFileType expectedType) { var supportedType = filePath.ParseFileType(); @@ -56,6 +58,7 @@ public static void EnsureAllExtensionsAreUnique() [Test] public static void TestSupportedFileTypeExtension_Errors() { + string badTest = "badFile.taco"; Exception e = Assert.Throws(() => badTest.ParseFileType()); Assert.That(e?.Message, Is.EqualTo($"File type not supported")); diff 
--git a/mzLib/Test/FileReadingTests/TestTimsTofFileReader.cs b/mzLib/Test/FileReadingTests/TestTimsTofFileReader.cs new file mode 100644 index 000000000..d1bd523ca --- /dev/null +++ b/mzLib/Test/FileReadingTests/TestTimsTofFileReader.cs @@ -0,0 +1,315 @@ +using MassSpectrometry; +using MathNet.Numerics; +using NUnit.Framework; +using NUnit.Framework.Legacy; +using Readers; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using Assert = NUnit.Framework.Legacy.ClassicAssert; + +namespace Test.FileReadingTests +{ + [TestFixture] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + public class TestTimsTofFileReader + { + + public string _testDataPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "timsTOF_snippet.d"); + public TimsTofFileReader _testReader; + public TimsDataScan _testMs2Scan; + public TimsDataScan _testMs1Scan; + public FilteringParams _filteringParams = new FilteringParams(numberOfPeaksToKeepPerWindow:200, minimumAllowedIntensityRatioToBasePeak: 0.01); + + [OneTimeSetUp] + public void SetUp() + { + _testReader = new TimsTofFileReader(_testDataPath); + _testReader.LoadAllStaticData(filteringParams: _filteringParams, maxThreads: 10); + _testMs2Scan = (TimsDataScan)_testReader.Scans.Skip(1000).First(scan => scan.MsnOrder > 1); + _testMs1Scan = (TimsDataScan)_testReader.Scans.Skip(500).First(scan => scan.MsnOrder == 1); + } + + [Test] + public void TestGetPasefScanFromDynamicConnectionUsingFrameId() + { + var dynamicReader = new TimsTofFileReader(_testDataPath); + dynamicReader.InitiateDynamicConnection(); + var dynamicScan = dynamicReader + .GetScanFromPrecursorAndFrameIdFromDynamicConnection((int)_testMs2Scan.PrecursorId, (int)_testMs2Scan.FrameId, _filteringParams); + Assert.IsNotNull(dynamicScan); + + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs2Scan.PrecursorId), "PrecursorId values are not equal."); + 
Assert.That(dynamicScan.ScanNumberStart, Is.EqualTo(_testMs2Scan.ScanNumberStart), "ScanStart values are not equal."); + Assert.That(dynamicScan.ScanNumberEnd, Is.EqualTo(_testMs2Scan.ScanNumberEnd), "ScanEnd values are not equal."); + Assert.That(dynamicScan.OneOverK0, Is.EqualTo(_testMs2Scan.OneOverK0), "ScanMedian values are not equal."); + Assert.That(dynamicScan.IsolationMz, Is.EqualTo(_testMs2Scan.IsolationMz), "IsolationMz values are not equal."); + Assert.That(dynamicScan.IsolationWidth, Is.EqualTo(_testMs2Scan.IsolationWidth), "IsolationWidth values are not equal."); + Assert.That(dynamicScan.HcdEnergy, Is.EqualTo(_testMs2Scan.HcdEnergy), "CollisionEnergy values are not equal."); + Assert.That(dynamicScan.SelectedIonMZ, Is.EqualTo(_testMs2Scan.SelectedIonMZ), "MostAbundantPrecursorMz values are not equal."); + Assert.That(dynamicScan.SelectedIonMonoisotopicGuessMz, Is.EqualTo(_testMs2Scan.SelectedIonMonoisotopicGuessMz), "PrecursorMonoisotopicMz values are not equal."); + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs2Scan.PrecursorId), "PrecursorID values are not equal."); + Assert.That(dynamicScan.MassSpectrum, Is.EqualTo(_testMs2Scan.MassSpectrum), "Mass spectra are not equal"); + } + + [Test] + public void TestGetMs1ScanFromDynamicConnectionUsingFrameId() + { + var dynamicReader = new TimsTofFileReader(_testDataPath); + dynamicReader.InitiateDynamicConnection(); + var dynamicScan = dynamicReader + .GetScanFromPrecursorAndFrameIdFromDynamicConnection((int)_testMs1Scan.PrecursorId, (int)_testMs1Scan.FrameId, _filteringParams); + Assert.IsNotNull(dynamicScan); + + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs1Scan.PrecursorId), "PrecursorId values are not equal."); + Assert.That(dynamicScan.ScanNumberStart, Is.EqualTo(_testMs1Scan.ScanNumberStart), "ScanStart values are not equal."); + Assert.That(dynamicScan.ScanNumberEnd, Is.EqualTo(_testMs1Scan.ScanNumberEnd), "ScanEnd values are not equal."); + Assert.That(dynamicScan.OneOverK0, 
Is.EqualTo(_testMs1Scan.OneOverK0), "ScanMedian values are not equal."); + Assert.That(dynamicScan.IsolationMz, Is.EqualTo(_testMs1Scan.IsolationMz), "IsolationMz values are not equal."); + Assert.That(dynamicScan.IsolationWidth, Is.EqualTo(_testMs1Scan.IsolationWidth), "IsolationWidth values are not equal."); + Assert.That(dynamicScan.HcdEnergy, Is.EqualTo(_testMs1Scan.HcdEnergy), "CollisionEnergy values are not equal."); + Assert.That(dynamicScan.SelectedIonMZ, Is.EqualTo(_testMs1Scan.SelectedIonMZ), "MostAbundantPrecursorMz values are not equal."); + Assert.That(dynamicScan.SelectedIonMonoisotopicGuessMz, Is.EqualTo(_testMs1Scan.SelectedIonMonoisotopicGuessMz), "PrecursorMonoisotopicMz values are not equal."); + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs1Scan.PrecursorId), "PrecursorID values are not equal."); + Assert.That(dynamicScan.MassSpectrum, Is.EqualTo(_testMs1Scan.MassSpectrum), "Mass spectra are not equal"); + } + + [Test] + public void TestGetScanFromDynamicConnectionUsingOneBasedScanNumber() + { + var dynamicReader = new TimsTofFileReader(_testDataPath); + dynamicReader.InitiateDynamicConnection(); + var scanBeforeCast = dynamicReader.GetOneBasedScanFromDynamicConnection(_testMs1Scan.OneBasedScanNumber, _filteringParams); + var dynamicScan = scanBeforeCast as TimsDataScan; + Assert.IsNotNull(dynamicScan); + + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs1Scan.PrecursorId), "PrecursorId values are not equal."); + Assert.That(dynamicScan.ScanNumberStart, Is.EqualTo(_testMs1Scan.ScanNumberStart), "ScanStart values are not equal."); + Assert.That(dynamicScan.ScanNumberEnd, Is.EqualTo(_testMs1Scan.ScanNumberEnd), "ScanEnd values are not equal."); + Assert.That(dynamicScan.OneOverK0, Is.EqualTo(_testMs1Scan.OneOverK0), "ScanMedian values are not equal."); + Assert.That(dynamicScan.IsolationMz, Is.EqualTo(_testMs1Scan.IsolationMz), "IsolationMz values are not equal."); + Assert.That(dynamicScan.IsolationWidth, 
Is.EqualTo(_testMs1Scan.IsolationWidth), "IsolationWidth values are not equal."); + Assert.That(dynamicScan.HcdEnergy, Is.EqualTo(_testMs1Scan.HcdEnergy), "CollisionEnergy values are not equal."); + Assert.That(dynamicScan.SelectedIonMZ, Is.EqualTo(_testMs1Scan.SelectedIonMZ), "MostAbundantPrecursorMz values are not equal."); + Assert.That(dynamicScan.SelectedIonMonoisotopicGuessMz, Is.EqualTo(_testMs1Scan.SelectedIonMonoisotopicGuessMz), "PrecursorMonoisotopicMz values are not equal."); + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs1Scan.PrecursorId), "PrecursorID values are not equal."); + Assert.That(dynamicScan.MassSpectrum, Is.EqualTo(_testMs1Scan.MassSpectrum), "Mass spectra are not equal"); + Assert.That(dynamicScan.OneBasedScanNumber, Is.EqualTo(_testMs1Scan.OneBasedScanNumber)); + + + scanBeforeCast = dynamicReader.GetOneBasedScanFromDynamicConnection(_testMs2Scan.OneBasedScanNumber, _filteringParams); + dynamicScan = scanBeforeCast as TimsDataScan; + Assert.IsNotNull(dynamicScan); + + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs2Scan.PrecursorId), "PrecursorId values are not equal."); + Assert.That(dynamicScan.ScanNumberStart, Is.EqualTo(_testMs2Scan.ScanNumberStart), "ScanStart values are not equal."); + Assert.That(dynamicScan.ScanNumberEnd, Is.EqualTo(_testMs2Scan.ScanNumberEnd), "ScanEnd values are not equal."); + Assert.That(dynamicScan.OneOverK0, Is.EqualTo(_testMs2Scan.OneOverK0), "ScanMedian values are not equal."); + Assert.That(dynamicScan.IsolationMz, Is.EqualTo(_testMs2Scan.IsolationMz), "IsolationMz values are not equal."); + Assert.That(dynamicScan.IsolationWidth, Is.EqualTo(_testMs2Scan.IsolationWidth), "IsolationWidth values are not equal."); + Assert.That(dynamicScan.HcdEnergy, Is.EqualTo(_testMs2Scan.HcdEnergy), "CollisionEnergy values are not equal."); + Assert.That(dynamicScan.SelectedIonMZ, Is.EqualTo(_testMs2Scan.SelectedIonMZ), "MostAbundantPrecursorMz values are not equal."); + 
Assert.That(dynamicScan.SelectedIonMonoisotopicGuessMz, Is.EqualTo(_testMs2Scan.SelectedIonMonoisotopicGuessMz), "PrecursorMonoisotopicMz values are not equal."); + Assert.That(dynamicScan.PrecursorId, Is.EqualTo(_testMs2Scan.PrecursorId), "PrecursorID values are not equal."); + Assert.That(dynamicScan.MassSpectrum, Is.EqualTo(_testMs2Scan.MassSpectrum), "Mass spectra are not equal"); + Assert.That(dynamicScan.OneBasedScanNumber, Is.EqualTo(_testMs2Scan.OneBasedScanNumber)); + } + + [Test] + public void TestTwoPointerMerge() + { + uint[] indices1 = new uint[] { 1, 3, 5, 7, 9, 11 }; + uint[] indices2 = new uint[] { 0, 2, 4, 6, 8, 10 }; + + int[] intensities1 = new int[] { 1, 3, 5, 7, 9, 11 }; + int[] intensities2 = new int[] { 0, 2, 4, 6, 8, 10 }; + + int[] intendedOutput = new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + + var mergerOutput = TofSpectraMerger.TwoPointerMerge(indices1, indices2, intensities1, intensities2); + + Assert.That(mergerOutput.Intensities, Is.EqualTo(intendedOutput)); + Assert.That(mergerOutput.Indices.Select(i => (int)i).ToArray(), Is.EqualTo(intendedOutput)); + + indices2 = new uint[] { 0, 2, 4, 6, 8, 10, 12, 13, 14, 15, 16 }; + + intensities2 = new int[] { 0, 2, 4, 6, 8, 10, 12, 13, 14, 15, 16 }; + + intendedOutput = new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + + mergerOutput = TofSpectraMerger.TwoPointerMerge(indices1, indices2, intensities1, intensities2); + + Assert.That(mergerOutput.Intensities, Is.EqualTo(intendedOutput)); + Assert.That(mergerOutput.Indices.Select(i => (int)i).ToArray(), Is.EqualTo(intendedOutput)); + } + + [Test] + public void TestCollapse() + { + uint[] indices = new uint[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + int[] intensities = new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }; + + List intendedIdx = new List { 1, 4, 7, 10 }; + List intendedIntensities = new List { 3, 12, 21, 30 }; + + var collapsedOutput = TofSpectraMerger.CollapseArrays(indices, intensities); + + 
Assert.That(collapsedOutput.Indices, Is.EqualTo(intendedIdx)); + Assert.That(collapsedOutput.Intensities, Is.EqualTo(intendedIntensities)); + + + indices = new uint[] { 0, 1, 2, 3, 4, 5, 6, 7, 9, 11 }; + intensities = new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 9, 11 }; + + intendedIdx = new List { 1, 4, 6, 9 }; + intendedIntensities = new List { 3, 12, 13, 20 }; + + collapsedOutput = TofSpectraMerger.CollapseArrays(indices, intensities); + + Assert.That(collapsedOutput.Indices, Is.EqualTo(intendedIdx)); + Assert.That(collapsedOutput.Intensities, Is.EqualTo(intendedIntensities)); + + indices = new uint[] { 0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 11, 18, 523, 1000, 1000, 1000 }; + intensities = new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 9, 11, 11, 18, 523, 1000, 1000, 1000 }; + + intendedIdx = new List { 1, 4, 6, 11, 18, 523, 1000 }; + intendedIntensities = new List { 3, 12, 13, 31, 18, 523, 3000 }; + + collapsedOutput = TofSpectraMerger.CollapseArrays(indices, intensities); + + Assert.That(collapsedOutput.Indices, Is.EqualTo(intendedIdx)); + Assert.That(collapsedOutput.Intensities, Is.EqualTo(intendedIntensities)); + } + + [Test] + public void TestConstructor() + { + var reader = MsDataFileReader.GetDataFile(_testDataPath); + Assert.That(reader, !Is.Null); + } + + [Test] + public void TestFileDoesntExist() + { + string fakePath = "fakePath.d"; + Assert.Throws(() => + MsDataFileReader.GetDataFile(fakePath)); + + TimsTofFileReader reader = new TimsTofFileReader(fakePath); + + Assert.Throws(() => + reader.LoadAllStaticData()); + } + + + [Test] + public void TestLoadAllStaticData() + { + Assert.That(_testReader.NumSpectra, Is.EqualTo(4096)); + + Assert.That(_testMs2Scan.Polarity == Polarity.Positive); + Assert.That(_testMs2Scan.DissociationType == DissociationType.CID); + Assert.That(_testMs2Scan.TotalIonCurrent == 25130); + Assert.That(_testMs2Scan.NativeId == "frames=64-64;scans=410-435"); + Assert.That(_testMs2Scan.SelectedIonMZ, Is.EqualTo(739.3668).Within(0.001)); + 
Assert.That(_testMs2Scan.MsnOrder == 2); + Assert.That(_testMs2Scan.IsCentroid); + Assert.That(_testMs2Scan.ScanNumberStart == 410); + Assert.That(_testMs2Scan.OneOverK0, Is.EqualTo(1.0424).Within(0.0001)); + } + + [Test] + public void TestOneBasedPrecursor() + { + TimsDataScan ms1Scan = (TimsDataScan)_testReader.GetOneBasedScan((int)_testMs2Scan.OneBasedPrecursorScanNumber); + + Assert.AreEqual(_testMs2Scan.PrecursorId, ms1Scan.PrecursorId); + // Check that the child and parent scan are both looking at the same timsScans (i.e., the same region in the ion-mobility dimension) + Assert.AreEqual(_testMs2Scan.ScanNumberStart, ms1Scan.ScanNumberStart); + Assert.AreEqual(_testMs2Scan.ScanNumberEnd, ms1Scan.ScanNumberEnd); + Assert.AreEqual(_testMs2Scan.OneOverK0, ms1Scan.OneOverK0); + + } + + [Test] + public void TestSpectraMerger() + { + double[] mz1 = new double[] { 1, 3, 5, 7, 9 }; + double[] mz2 = new double[] { 2, 4, 6, 8, 10 }; + + int[] intensity1 = new int[] { 1, 3, 5, 7, 9 }; + int[] intensity2 = new int[] { 2, 4, 6, 8, 10 }; + + MzSpectrum outSpectrum = TofSpectraMerger.MergeArraysToMs2Spectrum( + new List { mz1, mz2 }, + new List { intensity1, intensity2 }); + + Assert.AreEqual(outSpectrum.Size, 10); + CollectionAssert.AreEqual(outSpectrum.XArray, new double[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }); + } + + [Test] + public void TestSpectraMerger2() + { + double[] mz1 = new double[] { 1, 3, 5, 7, 9, 10 }; + double[] mz2 = new double[] { 2, 4, 6, 8, 10 }; + + int[] intensity1 = new int[] { 1, 3, 5, 7, 9, 10 }; + int[] intensity2 = new int[] { 2, 4, 6, 8, 10 }; + + MzSpectrum outSpectrum = TofSpectraMerger.MergeArraysToMs2Spectrum( + new List { mz1, mz2 }, + new List { intensity1, intensity2 }); + + Assert.AreEqual(outSpectrum.Size, 10); + CollectionAssert.AreEqual(outSpectrum.XArray, new double[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }); + CollectionAssert.AreEqual(outSpectrum.YArray, new double[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 20 }); + } + + [Test] + public void 
TestSpectraMerger3() + { + double[] mz1 = new double[] { 1, 4, 7, 10 }; + double[] mz2 = new double[] { 2, 5, 8 }; + double[] mz3 = new double[] { 3, 6, 9 }; + + int[] intensity1 = new int[] { 1, 4, 7, 10 }; + int[] intensity2 = new int[] { 2, 5, 8 }; + int[] intensity3 = new int[] { 3, 6, 9 }; + + MzSpectrum outSpectrum = TofSpectraMerger.MergeArraysToMs2Spectrum( + new List { mz1, mz2, mz3 }, + new List { intensity1, intensity2, intensity3 }); + + Assert.AreEqual(outSpectrum.Size, 10); + CollectionAssert.AreEqual(outSpectrum.XArray, new double[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }); + CollectionAssert.AreEqual(outSpectrum.YArray, new double[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }); + } + + // Test that weighted averaging works when two peaks are close together + [Test] + public void TestSpectraMerger4() + { + double[] mz1 = new double[] { 1, 3, 5, 7, 9 }; + double[] mz2 = new double[] { 2, 4, 6, 8, 10 }; + double[] mz3 = new double[] { 1 + 1e-6, 2 + 1e-6, 11 + 1e-6 }; + + int[] intensity1 = new int[] { 1, 3, 5, 7, 9 }; + int[] intensity2 = new int[] { 2, 4, 6, 8, 10 }; + int[] intensity3 = new int[] { 10, 10, 11 }; + + MzSpectrum outSpectrum = TofSpectraMerger.MergeArraysToMs2Spectrum( + new List { mz1, mz2, mz3 }, + new List { intensity1, intensity2, intensity3 }); + + Assert.AreEqual(outSpectrum.Size, 11); + // Peaks (mz = 1, intensity = 1) and (mz = 1+1e-6, intensity = 10) are close together, so they should be averaged + // Same thing for (mz = 2, intensity = 2) and (mz = 2+1e-6, intensity = 10) + CollectionAssert.AreEqual(outSpectrum.XArray.Select(mz => mz.Round(7)).ToArray(), + new double[] { 1 + 9e-7, 2 + 8e-7, 3, 4, 5, 6, 7, 8, 9, 10, 11 + 1e-6 }); + CollectionAssert.AreEqual(outSpectrum.YArray, new double[] { 11, 12, 3, 4, 5, 6, 7, 8, 9, 10, 11 }); + } + } +} diff --git a/mzLib/Test/ObjectPoolTests.cs b/mzLib/Test/ObjectPoolTests.cs new file mode 100644 index 000000000..bac0600eb --- /dev/null +++ b/mzLib/Test/ObjectPoolTests.cs @@ -0,0 +1,120 @@ +using 
MzLibUtil; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; + +namespace Test; + +[TestFixture] +[ExcludeFromCodeCoverage] +public class HashSetPoolTests +{ + [Test] + public void Get_ReturnsHashSetInstance() + { + var pool = new HashSetPool(); + var hashSet = pool.Get(); + Assert.That(hashSet, Is.Not.Null); + pool.Return(hashSet); + } + + [Test] + public void Return_ClearsHashSetBeforeReturningToPool() + { + var pool = new HashSetPool(); + var hashSet = pool.Get(); + hashSet.Add(1); + pool.Return(hashSet); + Assert.That(hashSet.Count, Is.EqualTo(0)); + } + + [Test] + public void Return_ThrowsArgumentNullException_WhenHashSetIsNull() + { + var pool = new HashSetPool(); + Assert.Throws(() => pool.Return(null)); + } +} + +[TestFixture] +[ExcludeFromCodeCoverage] +public class DictionaryPoolTests +{ + [Test] + public void Get_ReturnsDictionaryInstance() + { + var dictionaryPool = new DictionaryPool(); + var dictionary = dictionaryPool.Get(); + Assert.That(dictionary, Is.Not.Null); + Assert.That(dictionary, Is.InstanceOf>()); + } + + [Test] + public void Return_ClearsAndReturnsDictionaryToPool() + { + var dictionaryPool = new DictionaryPool(); + var dictionary = dictionaryPool.Get(); + dictionary["key"] = 42; + + dictionaryPool.Return(dictionary); + + Assert.That(dictionary.Count, Is.EqualTo(0)); + } + + [Test] + public void Return_ThrowsArgumentNullException_WhenDictionaryIsNull() + { + var dictionaryPool = new DictionaryPool(); + Assert.Throws(() => dictionaryPool.Return(null)); + } +} + +[TestFixture] +[ExcludeFromCodeCoverage] +public class ListPoolTests +{ + [Test] + public void ListPool_Get_ReturnsListWithInitialCapacity() + { + // Arrange + int initialCapacity = 16; + var listPool = new ListPool(initialCapacity); + + // Act + var list = listPool.Get(); + + // Assert + Assert.That(list, Is.Not.Null); + Assert.That(list.Capacity, Is.EqualTo(initialCapacity)); + } + + [Test] + public void 
ListPool_Return_ClearsListBeforeReturningToPool() + { + // Arrange + var listPool = new ListPool(); + var list = listPool.Get(); + list.Add(1); + list.Add(2); + + // Act + listPool.Return(list); + var returnedList = listPool.Get(); + + // Assert + Assert.That(returnedList, Is.Not.Null); + Assert.That(returnedList, Is.Empty); + } + + [Test] + public void ListPool_Return_ThrowsArgumentNullException_WhenListIsNull() + { + // Arrange + var listPool = new ListPool(); + + // Act & Assert + Assert.That(() => listPool.Return(null), Throws.ArgumentNullException); + } +} + diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index b58d87522..81b9ab12d 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -39,6 +39,15 @@ + + Always + + + Always + + + Always + Always @@ -288,6 +297,30 @@ Always + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + + + PreserveNewest + Always @@ -414,10 +447,10 @@ Always - + Always - + Always @@ -494,6 +527,24 @@ Always + + + Always + + + PreserveNewest + + + Always + + + Always + + + PreserveNewest + + + PreserveNewest Always diff --git a/mzLib/Test/TestDeconvolution.cs b/mzLib/Test/TestDeconvolution.cs index 8f6cbb9ee..65d6d6c0c 100644 --- a/mzLib/Test/TestDeconvolution.cs +++ b/mzLib/Test/TestDeconvolution.cs @@ -3,9 +3,6 @@ using MassSpectrometry; using MzLibUtil; using NUnit.Framework; -using Assert = NUnit.Framework.Legacy.ClassicAssert; -using Proteomics; -using Proteomics.ProteolyticDigestion; using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; @@ -14,6 +11,8 @@ using System.Linq; using Omics.Digestion; using Omics.Modifications; +using Proteomics; +using Proteomics.ProteolyticDigestion; using Test.FileReadingTests; namespace Test @@ -26,15 +25,18 @@ public sealed class TestDeconvolution #region Old Deconvolution [Test] - [TestCase(586.2143122, 24, 41983672, 586.2)]//This is a lesser 
abundant charge state envelope at the low mz end - [TestCase(740.372202090153, 19, 108419280, 740.37)]//This is the most abundant charge state envelope - [TestCase(1081.385183, 13, 35454636, 1081.385)]//This is a lesser abundant charge state envelope at the high mz end - public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, int selectedIonChargeStateGuess, double selectedIonIntensity, double isolationMz) + [TestCase(586.2143122, 24, 41983672, 586.2)] //This is a lesser abundant charge state envelope at the low mz end + [TestCase(740.372202090153, 19, 108419280, 740.37)] //This is the most abundant charge state envelope + [TestCase(1081.385183, 13, 35454636, + 1081.385)] //This is a lesser abundant charge state envelope at the high mz end + public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, int selectedIonChargeStateGuess, + double selectedIonIntensity, double isolationMz) { MsDataScan[] Scans = new MsDataScan[1]; //txt file, not mgf, because it's an MS1. 
Most intense proteoform has mass of ~14037.9 Da - string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); + string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); string[] spectrumLines = File.ReadAllLines(Ms1SpectrumPath); @@ -51,7 +53,9 @@ public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, in MzSpectrum spectrum = new MzSpectrum(ms1mzs, ms1intensities, false); - Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); + Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), + "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, + selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); var myMsDataFile = new FakeMsDataFile(Scans); @@ -68,21 +72,24 @@ public void TestDeconvolutionProteoformMultiChargeState(double selectedIonMz, in [Test] [TestCase("APSGGKK", "12-18-17_frac7_calib_ms1_663_665.mzML", 2)] - [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] - [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] + [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", + "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] + [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", + "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] public static void 
CheckGetMostAbundantObservedIsotopicMass(string peptide, string file, int charge) { Protein test1 = new Protein(peptide, "Accession"); DigestionParams d = new DigestionParams(); - PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); + PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, + CleavageSpecificity.None, "", 0, new Dictionary(), 0); double m = pw.MostAbundantMonoisotopicMass.ToMz(charge); string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file); - var reader = MsDataFileReader.GetDataFile(singleScan); + var reader = MsDataFileReader.GetDataFile(singleScan); reader.LoadAllStaticData(); List singlescan = reader.GetAllScansList(); - + MzSpectrum singlespec = singlescan[0].MassSpectrum; MzRange singleRange = new MzRange(singlespec.XArray.Min(), singlespec.XArray.Max()); int minAssumedChargeState = 1; @@ -91,13 +98,16 @@ public static void CheckGetMostAbundantObservedIsotopicMass(string peptide, stri double intensityRatioLimit = 3; //check assigned correctly - List lie2 = singlespec.Deconvolute(singleRange, minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); + List lie2 = singlespec.Deconvolute(singleRange, minAssumedChargeState, + maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); //check that if already assigned, skips assignment and just recalls same value - List lie3 = singlespec.Deconvolute(singleRange, minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); - Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); + List lie3 = 
singlespec.Deconvolute(singleRange, minAssumedChargeState, + maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); + Assert.That(lie2.Select(p => p.MostAbundantObservedIsotopicMass), + Is.EqualTo(lie3.Select(p => p.MostAbundantObservedIsotopicMass)).Within(.0005)); } #endregion @@ -105,15 +115,18 @@ public static void CheckGetMostAbundantObservedIsotopicMass(string peptide, stri #region Classic Deconvolution [Test] - [TestCase(586.2143122, 24, 41983672, 586.2)]//This is a lesser abundant charge state envelope at the low mz end - [TestCase(740.372202090153, 19, 108419280, 740.37)]//This is the most abundant charge state envelope - [TestCase(1081.385183, 13, 35454636, 1081.385)]//This is a lesser abundant charge state envelope at the high mz end - public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIonMz, int selectedIonChargeStateGuess, double selectedIonIntensity, double isolationMz) + [TestCase(586.2143122, 24, 41983672, 586.2)] //This is a lesser abundant charge state envelope at the low mz end + [TestCase(740.372202090153, 19, 108419280, 740.37)] //This is the most abundant charge state envelope + [TestCase(1081.385183, 13, 35454636, + 1081.385)] //This is a lesser abundant charge state envelope at the high mz end + public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIonMz, + int selectedIonChargeStateGuess, double selectedIonIntensity, double isolationMz) { MsDataScan[] Scans = new MsDataScan[1]; //txt file, not mgf, because it's an MS1. 
Most intense proteoform has mass of ~14037.9 Da - string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); + string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, + @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); string[] spectrumLines = File.ReadAllLines(Ms1SpectrumPath); @@ -130,7 +143,9 @@ public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIo MzSpectrum spectrum = new MzSpectrum(ms1mzs, ms1intensities, false); - Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); + Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), + "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, + selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); var myMsDataFile = new FakeMsDataFile(Scans); @@ -141,7 +156,8 @@ public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIo DeconvolutionParameters deconParameters = new ClassicDeconvolutionParameters(1, 60, 4, 3); List isolatedMasses = scan.GetIsolatedMassesAndCharges(scan, deconParameters).ToList(); - List isolatedMasses2 = scan.GetIsolatedMassesAndCharges(scan.MassSpectrum, deconParameters).ToList(); + List isolatedMasses2 = + scan.GetIsolatedMassesAndCharges(scan.MassSpectrum, deconParameters).ToList(); List monoIsotopicMasses = isolatedMasses.Select(m => m.MonoisotopicMass).ToList(); List monoIsotopicMasses2 = isolatedMasses2.Select(m => m.MonoisotopicMass).ToList(); @@ -154,13 +170,16 @@ public void TestClassicDeconvolutionProteoformMultiChargeState(double selectedIo [Test] [TestCase("APSGGKK", "12-18-17_frac7_calib_ms1_663_665.mzML", 2)] - 
[TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] - [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] + [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", + "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] + [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", + "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptide, string file, int charge) { Protein test1 = new Protein(peptide, "Accession"); DigestionParams d = new DigestionParams(); - PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); + PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, + CleavageSpecificity.None, "", 0, new Dictionary(), 0); double m = pw.MostAbundantMonoisotopicMass.ToMz(charge); string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file); @@ -176,7 +195,8 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid double intensityRatioLimit = 3; DeconvolutionParameters deconParameters = - new ClassicDeconvolutionParameters(minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, + new ClassicDeconvolutionParameters(minAssumedChargeState, maxAssumedChargeState, + deconvolutionTolerancePpm, intensityRatioLimit); //check assigned correctly @@ -187,11 +207,9 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid //check that if already assigned, skips assignment and just recalls same value List lie3 = Deconvoluter.Deconvolute(singlespec, 
deconParameters, singleRange).ToList(); - Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass)); + Assert.That(lie2.Select(p => p.MostAbundantObservedIsotopicMass), Is.EqualTo(lie3.Select(p => p.MostAbundantObservedIsotopicMass))); } - #endregion - [Test] [TestCase(373.85, -5, 1874.28)] // GUAGUC -5 [TestCase(467.57, -4, 1874.28)] // GUAGUC -4 @@ -217,29 +235,150 @@ public void TestNegativeModeClassicDeconvolution(double expectedMz, int expected envelope.Peaks.Any(peak => tolerance.Within(peak.mz, expectedMz))); if (resultsWithPeakOfInterest is null) Assert.Fail(); - Assert.That(tolerance.Within(expectedMonoMass, resultsWithPeakOfInterest.MonoisotopicMass)); + Assert.That(expectedMonoMass, Is.EqualTo(resultsWithPeakOfInterest.MonoisotopicMass).Within(0.01)); Assert.That(expectedCharge, Is.EqualTo(resultsWithPeakOfInterest.Charge)); } + #endregion + + #region IsoDec Deconvolution + + [Test] + [TestCase(586.2143122, 24, 41983672, 586.2)]//This is a lesser abundant charge state envelope at the low mz end + [TestCase(740.372202090153, 19, 108419280, 740.37)]//This is the most abundant charge state envelope + [TestCase(1081.385183, 13, 35454636, 1081.385)]//This is a lesser abundant charge state envelope at the high mz end + public void TestIsoDecDeconvolutionProteoformMultiChargeState(double selectedIonMz, int selectedIonChargeStateGuess, double selectedIonIntensity, double isolationMz) + { + MsDataScan[] Scans = new MsDataScan[1]; + + //txt file, not mgf, because it's an MS1. 
Most intense proteoform has mass of ~14037.9 Da + string Ms1SpectrumPath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"DataFiles\14kDaProteoformMzIntensityMs1.txt"); + + string[] spectrumLines = File.ReadAllLines(Ms1SpectrumPath); + + int mzIntensityPairsCount = spectrumLines.Length; + double[] ms1mzs = new double[mzIntensityPairsCount]; + double[] ms1intensities = new double[mzIntensityPairsCount]; + + for (int i = 0; i < mzIntensityPairsCount; i++) + { + string[] pair = spectrumLines[i].Split('\t'); + ms1mzs[i] = Convert.ToDouble(pair[0], CultureInfo.InvariantCulture); + ms1intensities[i] = Convert.ToDouble(pair[1], CultureInfo.InvariantCulture); + } + + MzSpectrum spectrum = new MzSpectrum(ms1mzs, ms1intensities, false); + + Scans[0] = new MsDataScan(spectrum, 1, 1, false, Polarity.Positive, 1.0, new MzRange(495, 1617), "first spectrum", MZAnalyzerType.Unknown, spectrum.SumOfAllY, null, null, null, selectedIonMz, selectedIonChargeStateGuess, selectedIonIntensity, isolationMz, 4); + + var myMsDataFile = new FakeMsDataFile(Scans); + + MsDataScan scan = myMsDataFile.GetAllScansList()[0]; + + // The ones marked 2 are for checking an overload method + + DeconvolutionParameters deconParameters = new IsoDecDeconvolutionParameters(); + + IsoDecAlgorithm alg = new IsoDecAlgorithm(deconParameters); + List allMasses = alg.Deconvolute(scan.MassSpectrum, new MzRange((double)scan.MassSpectrum.FirstX, (double)scan.MassSpectrum.LastX)).ToList(); + + List isolatedMasses = scan.GetIsolatedMassesAndCharges(scan, deconParameters).ToList(); + List isolatedMasses2 = scan.GetIsolatedMassesAndCharges(scan.MassSpectrum, deconParameters).ToList(); + + List monoIsotopicMasses = isolatedMasses.Select(m => m.MonoisotopicMass).ToList(); + List monoIsotopicMasses2 = isolatedMasses2.Select(m => m.MonoisotopicMass).ToList(); + Assert.That(monoIsotopicMasses2.Count, Is.EqualTo(monoIsotopicMasses.Count)); + + //The primary monoisotopic mass should be the same regardless of which peak 
in which charge state was selected for isolation. + //this case is interesting because other monoisotopic mass may have a sodium adduct. The unit test could be expanded to consider this. + //Updated the tolerance on this test to be 5 ppm (which felt reasonable to me? --JGP) + double ppmwidth = (14037.926829 / 1e6) * 5; + bool isAnyEqual1 = monoIsotopicMasses.Any(m => m >= 14037.926829 - ppmwidth && m <= 14037.926826 + ppmwidth); + bool isAnyEqual2 = monoIsotopicMasses2.Any(m => m >= 14037.926829 - ppmwidth && m <= 14037.926826 + ppmwidth); + Assert.That(isAnyEqual1); + Assert.That(isAnyEqual2); + } + + [Test] + [TestCase("APSGGKK", "12-18-17_frac7_calib_ms1_663_665.mzML", 2)] + [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)] + [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)] + public static void CheckIsoDecGetMostAbundantObservedIsotopicMass(string peptide, string file, int charge) + { + Protein test1 = new Protein(peptide, "Accession"); + DigestionParams d = new DigestionParams(); + PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); + double m = pw.MostAbundantMonoisotopicMass.ToMz(charge); + + string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file); + Mzml singleMZML = (Mzml)MsDataFileReader.GetDataFile(singleScan).LoadAllStaticData(); + + List singlescan = singleMZML.GetAllScansList(); + + MzSpectrum singlespec = singlescan[0].MassSpectrum; + MzRange singleRange = new MzRange(singlespec.XArray.Min(), singlespec.XArray.Max()); + DeconvolutionParameters deconParameters = new IsoDecDeconvolutionParameters(); + + //check assigned correctly + List lie2 = Deconvoluter.Deconvolute(singlespec, deconParameters, 
singleRange).ToList(); + List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); + Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); + + //check that if already assigned, skips assignment and just recalls same value + List lie3 = Deconvoluter.Deconvolute(singlespec, deconParameters, singleRange).ToList(); + Assert.That(lie2.Select(p => p.MostAbundantObservedIsotopicMass), Is.EqualTo(lie3.Select(p => p.MostAbundantObservedIsotopicMass))); + } + + [Test] + [TestCase(373.85, -5, 1874.28)] // GUAGUC -5 + [TestCase(936.13, -2, 1874.28)] // GUAGUC -2 + [TestCase(473.05, -4, 1896.26)] // GUAGUC +Na -H -4 + [TestCase(631.07, -3, 1896.26)] // GUAGUC +Na -H -3 + [TestCase(947.121, -2, 1896.26)] // GUAGUC +Na -H -2 + public void TestNegativeModeIsoDecDeconvolution(double expectedMz, int expectedCharge, double expectedMonoMass) + { + // get scan + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", + "GUACUG_NegativeMode_Sliced.mzML"); + var scan = MsDataFileReader.GetDataFile(filePath).GetAllScansList().First(); + var tolerance = new PpmTolerance(20); + + // set up deconvolution + DeconvolutionParameters deconParams = new IsoDecDeconvolutionParameters(Polarity.Negative); + + List deconvolutionResults = Deconvoluter.Deconvolute(scan, deconParams).ToList(); + // ensure each expected result is found, with correct mz, charge, and monoisotopic mass + /*var resultsWithPeakOfInterest = deconvolutionResults.FirstOrDefault(envelope => tolerance.Within(envelope.MonoisotopicMass, expectedMonoMass));*/ + var resultsWithPeakOfInterest = deconvolutionResults.FirstOrDefault(envelope => + envelope.Peaks.Any(peak => tolerance.Within(peak.mz, expectedMz))); + if (resultsWithPeakOfInterest is null) Assert.Fail(); + + Assert.That(expectedMonoMass, Is.EqualTo(resultsWithPeakOfInterest.MonoisotopicMass).Within(0.01)); + Assert.That(expectedCharge, Is.EqualTo(resultsWithPeakOfInterest.Charge)); + } + + 
#endregion + [Test] public static void TestExampleNewDeconvolutionInDeconvoluter() { DeconvolutionParameters deconParams = new ExampleNewDeconvolutionParametersTemplate(1, 60); - var dataFile = MsDataFileReader.GetDataFile(Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "GUACUG_NegativeMode_Sliced.mzML")); + var dataFile = MsDataFileReader.GetDataFile(Path.Combine(TestContext.CurrentContext.TestDirectory, + "DataFiles", "GUACUG_NegativeMode_Sliced.mzML")); dataFile.InitiateDynamicConnection(); var scan = dataFile.GetOneBasedScanFromDynamicConnection(726); var spectrum = scan.MassSpectrum; dataFile.CloseDynamicConnection(); // test switch statements in Deconvoluter - Assert.Throws(() => Deconvoluter.Deconvolute(spectrum, deconParams)); - Assert.Throws(() => Deconvoluter.Deconvolute(scan, deconParams)); + Assert.Throws(() => _ = Deconvoluter.Deconvolute(spectrum, deconParams).ToList()); + Assert.Throws(() => _ = Deconvoluter.Deconvolute(scan, deconParams).ToList()); // test default exceptions in deconvoluter var badEnumValue = (DeconvolutionType)Int32.MaxValue; deconParams.GetType().GetProperty("DeconvolutionType")!.SetValue(deconParams, badEnumValue); - Assert.Throws(() => Deconvoluter.Deconvolute(spectrum, deconParams)); - Assert.Throws(() => Deconvoluter.Deconvolute(scan, deconParams)); + Assert.Throws(() => _ = Deconvoluter.Deconvolute(spectrum, deconParams).ToList()); + Assert.Throws(() => _ = Deconvoluter.Deconvolute(scan, deconParams).ToList()); } @@ -247,14 +386,15 @@ public static void TestExampleNewDeconvolutionInDeconvoluter() public static void Test_MsDataScan_GetIsolatedMassesAndCharges() { // get scan - string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "GUACUG_NegativeMode_Sliced.mzML"); + string filePath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", + "GUACUG_NegativeMode_Sliced.mzML"); var dataFile = MsDataFileReader.GetDataFile(filePath); var precursorScan = 
dataFile.GetOneBasedScan(1); var fragmentationScan = dataFile.GetOneBasedScan(2); // set up deconvolution DeconvolutionParameters deconParams = new ClassicDeconvolutionParameters(-10, -1, 20, 3, Polarity.Negative); - + // get isolated masses and charges on an MS1 scan. This means the isolation window is null. var ms1Result = precursorScan.GetIsolatedMassesAndCharges(precursorScan.MassSpectrum, deconParams).ToList(); Assert.That(ms1Result.Count, Is.EqualTo(0)); @@ -262,10 +402,132 @@ public static void Test_MsDataScan_GetIsolatedMassesAndCharges() Assert.That(ms1Result.Count, Is.EqualTo(0)); // get isolated masses and charges on an MS2 scan. This should work correctly - var ms2Result = fragmentationScan.GetIsolatedMassesAndCharges(precursorScan.MassSpectrum, deconParams).ToList(); + var ms2Result = fragmentationScan.GetIsolatedMassesAndCharges(precursorScan.MassSpectrum, deconParams) + .ToList(); Assert.That(ms2Result.Count, Is.EqualTo(1)); ms2Result = fragmentationScan.GetIsolatedMassesAndCharges(precursorScan, deconParams).ToList(); Assert.That(ms2Result.Count, Is.EqualTo(1)); } + + [Test] + public void NeutralMassSpectrum_Deconvolute_AllInRange() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges = new[] { 1, 1 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(260.0, 400.0); + + // Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.That(result, Is.Not.Null); + Assert.That(result, Is.InstanceOf>()); + Assert.That(result.Count(), Is.EqualTo(2)); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, 
Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } + + + [Test] + public void NeutralMassSpectrum_Deconvolute_AllInRange_Charged() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges = new[] { 3, 3 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(00, 200.0); + + // Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.That(result, Is.Not.Null); + Assert.That(result, Is.InstanceOf>()); + Assert.That(result.Count(), Is.EqualTo(2)); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } + + [Test] + public void NeutralMassSpectrum_Deconvolute_SomeInRange() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges = new[] { 1, 1 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(260.0, 300.0); + + // Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.That(result, Is.Not.Null); + Assert.That(result, 
Is.InstanceOf>()); + Assert.That(result.Count(), Is.EqualTo(1)); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } + + [Test] + public void NeutralMassSpectrum_Deconvolute_SomeInRange_Charged() + { + // Arrange + var xArray = new[] { 260.774188159546, 391.660998843979 }; + var yArray = new[] { 1000.0, 1.0 }; + var charges = new[] { 1, 20 }; + var spectrum = new NeutralMassSpectrum(xArray, yArray, charges, false); + var deconvolutionParameters = new ClassicDeconvolutionParameters(1, 60, 20, 2); + var rangeToGetPeaksFrom = new MzRange(260.0, 300.0); + + // Act + var result = Deconvoluter.Deconvolute(spectrum, deconvolutionParameters, rangeToGetPeaksFrom).ToList(); + + // Assert + Assert.That(result, Is.Not.Null); + Assert.That(result, Is.InstanceOf>()); + Assert.That(result.Count(), Is.EqualTo(1)); + + for (int i = 0; i < result.Count(); i++) + { + Assert.That(result[i].MonoisotopicMass, Is.EqualTo(xArray[i])); + Assert.That(result[i].TotalIntensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Peaks.Count, Is.EqualTo(1)); + Assert.That(result[i].Peaks.First().mz, Is.EqualTo(xArray[i].ToMz(charges[i]))); + Assert.That(result[i].Peaks.First().intensity, Is.EqualTo(yArray[i])); + Assert.That(result[i].Charge, Is.EqualTo(charges[i])); + } + } } -} \ No newline at end of file +} diff --git a/mzLib/Test/TestDigestionMotif.cs b/mzLib/Test/TestDigestionMotif.cs index 39b7a1704..b2240d62c 100644 --- a/mzLib/Test/TestDigestionMotif.cs +++ b/mzLib/Test/TestDigestionMotif.cs @@ -595,5 +595,49 @@ public void TestDigestionParamsMaskedProperties() digestionParams.MaxModsForPeptide = 
3; Assert.That(digestionParams.MaxMods, Is.EqualTo(digestionParams.MaxModsForPeptide)); } + + private class TestDigestionAgent : DigestionAgent + { + public TestDigestionAgent(string name, CleavageSpecificity cleavageSpecificity, List motifList, Modification cleavageMod) + : base(name, cleavageSpecificity, motifList, cleavageMod) + { + } + } + + [Test] + public void Equals_SameName_ReturnsTrue() + { + var agent1 = ProteaseDictionary.Dictionary["trypsin"]; + var agent2 = ProteaseDictionary.Dictionary["trypsin"]; + + Assert.That(agent1.Equals(agent2), Is.True); + } + + [Test] + public void Equals_DifferentName_ReturnsFalse() + { + var agent1 = ProteaseDictionary.Dictionary["trypsin"]; + var agent2 = ProteaseDictionary.Dictionary["Arg-C"]; + + Assert.That(agent1.Equals(agent2), Is.False); + } + + [Test] + public void GetHashCode_SameName_ReturnsSameHashCode() + { + var agent1 = ProteaseDictionary.Dictionary["trypsin"]; + var agent2 = ProteaseDictionary.Dictionary["trypsin"]; + + Assert.That(agent1.GetHashCode(), Is.EqualTo(agent2.GetHashCode())); + } + + [Test] + public void GetHashCode_DifferentName_ReturnsDifferentHashCode() + { + var agent1 = ProteaseDictionary.Dictionary["trypsin"]; + var agent2 = ProteaseDictionary.Dictionary["Arg-C"]; + + Assert.That(agent1.GetHashCode(), Is.Not.EqualTo(agent2.GetHashCode())); + } } } \ No newline at end of file diff --git a/mzLib/Test/TestModifications.cs b/mzLib/Test/TestModifications.cs index 021f08dc2..b1a25f91c 100644 --- a/mzLib/Test/TestModifications.cs +++ b/mzLib/Test/TestModifications.cs @@ -802,5 +802,78 @@ public static void TestUniprotResidualMod() Assert.That(peptide.FullSequence == "PEPT[UniProt:acetylation on T]IDE"); } + + [Test] + public void CompareTo_SameModification_ReturnsZero() + { + ModificationMotif.TryGetMotif("A", out var motif); + var chemicalFormula = new ChemicalFormula(); + var mod1 = new Modification("mod1", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + var 
mod2 = new Modification("mod1", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + + NUnit.Framework.Assert.That(mod1.CompareTo(mod2), Is.EqualTo(0)); + } + + [Test] + public void CompareTo_DifferentIdWithMotif_ReturnsNonZero() + { + ModificationMotif.TryGetMotif("A", out var motif); + var chemicalFormula = new ChemicalFormula(); + var mod1 = new Modification("mod1", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + var mod2 = new Modification("mod2", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + + NUnit.Framework.Assert.That(mod1.CompareTo(mod2), Is.LessThan(0)); + NUnit.Framework.Assert.That(mod2.CompareTo(mod1), Is.GreaterThan(0)); + } + + [Test] + public void CompareTo_DifferentModificationType_ReturnsNonZero() + { + ModificationMotif.TryGetMotif("A", out var motif); + var chemicalFormula = new ChemicalFormula(); + var mod1 = new Modification("mod1", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + var mod2 = new Modification("mod1", "accession1", "type2", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + + NUnit.Framework.Assert.That(mod1.CompareTo(mod2), Is.LessThan(0)); + NUnit.Framework.Assert.That(mod2.CompareTo(mod1), Is.GreaterThan(0)); + } + + [Test] + public void CompareTo_DifferentTarget_ReturnsNonZero() + { + ModificationMotif.TryGetMotif("A", out var motif1); + ModificationMotif.TryGetMotif("B", out var motif2); + var chemicalFormula = new ChemicalFormula(); + var mod1 = new Modification("mod1", "accession1", "type1", "feature1", motif1, "N-terminal.", chemicalFormula, 100.0); + var mod2 = new Modification("mod1", "accession1", "type1", "feature1", motif2, "N-terminal.", chemicalFormula, 100.0); + + NUnit.Framework.Assert.That(mod1.CompareTo(mod2), Is.LessThan(0)); + NUnit.Framework.Assert.That(mod2.CompareTo(mod1), Is.GreaterThan(0)); + } + + [Test] + public void 
CompareTo_DifferentLocationRestriction_ReturnsNonZero() + { + ModificationMotif.TryGetMotif("A", out var motif); + var chemicalFormula = new ChemicalFormula(); + var mod1 = new Modification("mod1", "accession1", "type1", "feature1", motif, "C-terminal.", chemicalFormula, 100.0); + var mod2 = new Modification("mod1", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + + NUnit.Framework.Assert.That(mod1.CompareTo(mod2), Is.LessThan(0)); + NUnit.Framework.Assert.That(mod2.CompareTo(mod1), Is.GreaterThan(0)); + } + + [Test] + public void CompareTo_DifferentMonoisotopicMass_ReturnsNonZero() + { + ModificationMotif.TryGetMotif("A", out var motif); + var chemicalFormula = new ChemicalFormula(); + var mod1 = new Modification("mod1", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 100.0); + var mod2 = new Modification("mod1", "accession1", "type1", "feature1", motif, "N-terminal.", chemicalFormula, 101.0); + + NUnit.Framework.Assert.That(mod1.CompareTo(mod2), Is.LessThan(0)); + NUnit.Framework.Assert.That(mod2.CompareTo(mod1), Is.GreaterThan(0)); + NUnit.Framework.Assert.That(mod2.CompareTo(null), Is.EqualTo(1)); + } } } \ No newline at end of file diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index e2864c1e6..f19144a42 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -1,6 +1,11 @@ using NUnit.Framework; using Assert = NUnit.Framework.Legacy.ClassicAssert; using MzLibUtil; +using Readers; +using System.Windows.Documents; +using System.Collections.Generic; +using System; +using System.Security.Cryptography.X509Certificates; namespace Test { @@ -18,7 +23,7 @@ public sealed class TestMzLibUtil [TestCase(@"C:\Users\bubba\Documents.docs\Projects\K562\K562_2\20100730_Velos1_TaGe_SA_K565_4.raw", "20100730_Velos1_TaGe_SA_K565_4")] //test extra period in filename [TestCase(@"C:\Users\bubba\Documents\Projects\K562\K562_2\20100730_Velos1_.TaGe_SA_K565_4.raw", 
"20100730_Velos1_.TaGe_SA_K565_4")] - [TestCase("/home/seth/Pictures/penguin.jpg","penguin")] + [TestCase("/home/seth/Pictures/penguin.jpg", "penguin")] [TestCase("/home/seth/Pictures/penguin", "penguin")] [TestCase("penguin.jpg", "penguin")] [TestCase("penguin", "penguin")] @@ -32,5 +37,171 @@ public static void TestPeriodTolerantFilenameWithoutExtension(string filenameAnd Assert.AreEqual(expectedResult, result); Assert.AreEqual(expectedResult, extensionResult); } + [Test] + public static void TestParseModificationsOneMod() + { + string fullSeq = "DM[Common Variable:Oxidation on M]MELVQPSISGVDLDK"; + var mods = fullSeq.ParseModifications(); + Assert.That(mods.Count == 1); + Assert.That(mods.ContainsKey(2)); + Assert.That(mods[2].Count == 1); + Assert.That(mods[2].Contains("Common Variable:Oxidation on M")); + } + public static void TestParseModificationsTwoModsZeroIndexing() + { + // sequence with two terminal mods with indexed termini (zero-based indexing) + string fullSeq = "[UniProt:N-acetylglutamate on E]EEEIAALVID[Metal:Calcium on D]NGSGMC[Common Fixed:Carbamidomethyl on C]"; + var mods = fullSeq.ParseModifications(true, true); + Assert.That(mods.Count == 3); + Assert.That(mods.ContainsKey(0)); + Assert.That(mods.ContainsKey(10)); + Assert.That(mods.ContainsKey(17)); + Assert.That(mods[0].Count == 1); + Assert.That(mods[10].Count == 1); + Assert.That(mods[17].Count == 1); + Assert.That(mods[0].Contains("UniProt:N-acetylglutamate on E")); + Assert.That(mods[10].Contains("Metal:Calcium on D")); + Assert.That(mods[17].Contains("Common Fixed:Carbamidomethyl on C")); + } + public static void TestParseModificationsTwoModsOneIndexing() + { + // sequence with two terminal mods with termini indexed at amino acid positions (one-based indexing) + string fullSeq = "[UniProt:N-acetylglutamate on E]EEEIAALVID[Metal:Calcium on D]NGSGMC[Common Fixed:Carbamidomethyl on C]"; + var mods = fullSeq.ParseModifications(); + Assert.That(mods.Count == 3); + 
Assert.That(mods.ContainsKey(1)); + Assert.That(mods.ContainsKey(10)); + Assert.That(mods.ContainsKey(16)); + Assert.That(mods[1].Count == 1); + Assert.That(mods[10].Count == 1); + Assert.That(mods[16].Count == 1); + Assert.That(mods[1].Contains("UniProt:N-acetylglutamate on E")); + Assert.That(mods[10].Contains("Metal:Calcium on D")); + Assert.That(mods[16].Contains("Common Fixed:Carbamidomethyl on C")); + } + public static void TestParseModificationsTwoModsSameTerminusZeroIndexing() + { + // sequence with two mods on same terminus with with indexed termini (zero-based indexing) + string fullSeq = "[UniProt:N-acetylglutamate on E]|[Common Artifact:Water Loss on E]EEEIAALVID[Metal:Calcium on D]NGSGMC[Common Fixed:Carbamidomethyl on C]"; + var mods = fullSeq.ParseModifications(true, true); + Assert.That(mods.Count == 3); + Assert.That(mods.ContainsKey(0)); + Assert.That(mods.ContainsKey(10)); + Assert.That(mods.ContainsKey(17)); + Assert.That(mods[0].Count == 2); + Assert.That(mods[10].Count == 1); + Assert.That(mods[17].Count == 1); + Assert.That(mods[0].Contains("UniProt:N-acetylglutamate on E")); + Assert.That(mods[0].Contains("Common Artifact:Water Loss on E")); + Assert.That(mods[10].Contains("Metal:Calcium on D")); + Assert.That(mods[17].Contains("Common Fixed:Carbamidomethyl on C")); + } + public static void TestParseModificationsTwoModsSameTerminusOneIndexing() + { + // sequence with two mods on same terminus with termini indexed at amino acid positions (one-based indexing) + string fullSeq = "[UniProt:N-acetylglutamate on E]|[Common Artifact:Water Loss on E]EEEIAALVID[Metal:Calcium on D]NGSGMC[Common Fixed:Carbamidomethyl on C]"; + var mods = fullSeq.ParseModifications(); + Assert.That(mods.Count == 3); + Assert.That(mods.ContainsKey(1)); + Assert.That(mods.ContainsKey(10)); + Assert.That(mods.ContainsKey(16)); + Assert.That(mods[1].Count == 2); + Assert.That(mods[10].Count == 1); + Assert.That(mods[16].Count == 1); + 
Assert.That(mods[1].Contains("UniProt:N-acetylglutamate on E")); + Assert.That(mods[1].Contains("Common Artifact:Water Loss on E")); + Assert.That(mods[10].Contains("Metal:Calcium on D")); + Assert.That(mods[16].Contains("Common Fixed:Carbamidomethyl on C")); + } + public static void TestParseModificationsTwoModsTerminusAndSideChain() + { + // sequence with mod on N terminus and mod on first amino acid side chain + string fullSeq = "[UniProt:N-acetylglutamate on E]E[Metal:Sodium on E]EEIAALVID[Metal:Calcium on D]NGSGMC[Common Fixed:Carbamidomethyl on C]"; + var mods = fullSeq.ParseModifications(); + Assert.That(mods.Count == 3); + Assert.That(mods.ContainsKey(1)); + Assert.That(mods.ContainsKey(10)); + Assert.That(mods.ContainsKey(16)); + Assert.That(mods[1].Count == 2); + Assert.That(mods[10].Count == 1); + Assert.That(mods[16].Count == 1); + Assert.That(mods[1].Contains("UniProt:N-acetylglutamate on E")); + Assert.That(mods[1].Contains("Metal:Sodium on E")); + Assert.That(mods[10].Contains("Metal:Calcium on D")); + Assert.That(mods[16].Contains("Common Fixed:Carbamidomethyl on C")); + } + + [Test] + public void TestPeptidePTMOccupancy() + { + List sequences = new List(); + sequences.Add("[UniProt: N - acetylglutamate on E]EEEIAALVID[Metal: Calcium on D]NGSGMC[Common Fixed: Carbamidomethyl on C]K"); + sequences.Add("[UniProt: N - acetylglutamate on E]EEEIAALVID[Metal: Sodium on D]NGSGMC[Common Fixed: Carbamidomethyl on C]K"); + sequences.Add("[UniProt: N - acetylglutamate on E]EEEIAALVIDN[Common Artifact: Ammonia loss on N]GSGMC[Common Fixed: Carbamidomethyl on C]K"); + sequences.Add("[UniProt: N - acetylglutamate on E]EEEIAALVIDN[Common Biological: Hydroxylation on N]GSGMC[Common Fixed: Carbamidomethyl on C]K"); + sequences.Add("[UniProt: N - acetylglutamate on E]EEEIAALVIDNGSGM[Common Variable: Oxidation on M]C[Common Fixed: Carbamidomethyl on C]K"); + sequences.Add("[UniProt: N - acetylglutamate on E]EEEIAALVIDNGSGMC[Common Fixed: Carbamidomethyl on C]K"); + + 
string baseSeq = "EEEIAALVIDNGSGMCK"; + + List pgs = new List(); + pgs.Add("pg1"); + pgs.Add("pg2|pg3"); + + var peptides = new List, double>>(); + foreach (var seq in sequences) + { + peptides.Add(Tuple.Create(seq, baseSeq, pgs, 1.0)); + } + + PositionFrequencyAnalysis pfa = new PositionFrequencyAnalysis(); + pfa.PeptidePTMOccupancy(peptides); + var occupancy = pfa.Occupancy; + + Assert.That(6.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].ModifiedAminoAcidPositions[0]["UniProt: N - acetylglutamate on E"].Intensity); + Assert.That(1.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].ModifiedAminoAcidPositions[10]["Metal: Calcium on D"].Intensity); + Assert.That(1.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].ModifiedAminoAcidPositions[10]["Metal: Sodium on D"].Intensity); + Assert.That(1.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].ModifiedAminoAcidPositions[11]["Common Artifact: Ammonia loss on N"].Intensity); + Assert.That(1.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].ModifiedAminoAcidPositions[11]["Common Biological: Hydroxylation on N"].Intensity); + Assert.That(1.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].ModifiedAminoAcidPositions[15]["Common Variable: Oxidation on M"].Intensity); + Assert.That(6.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].ModifiedAminoAcidPositions[16]["Common Fixed: Carbamidomethyl on C"].Intensity); + Assert.That(6.0 == occupancy["pg1"].Proteins["pg1"].Peptides[baseSeq].Intensity); + + Assert.That(6.0 == occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].ModifiedAminoAcidPositions[0]["UniProt: N - acetylglutamate on E"].Intensity); + Assert.That(1.0 == occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].ModifiedAminoAcidPositions[10]["Metal: Calcium on D"].Intensity); + Assert.That(1.0 == occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].ModifiedAminoAcidPositions[10]["Metal: Sodium on D"].Intensity); + Assert.That(1.0 == 
occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].ModifiedAminoAcidPositions[11]["Common Artifact: Ammonia loss on N"].Intensity); + Assert.That(1.0 == occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].ModifiedAminoAcidPositions[11]["Common Biological: Hydroxylation on N"].Intensity); + Assert.That(1.0 == occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].ModifiedAminoAcidPositions[15]["Common Variable: Oxidation on M"].Intensity); + Assert.That(6.0 == occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].ModifiedAminoAcidPositions[16]["Common Fixed: Carbamidomethyl on C"].Intensity); + Assert.That(6.0 == occupancy["pg2|pg3"].Proteins["pg2"].Peptides[baseSeq].Intensity); + } + + [Test] + public static void TestToEnum() + { + Assert.IsTrue(0.ToEnum(out var result)); + Assert.AreEqual(TimsTofMsMsType.MS, result); + + Assert.IsTrue(2.ToEnum(out result)); + Assert.AreEqual(TimsTofMsMsType.MSMSFragment, result); + + Assert.IsTrue(8.ToEnum(out result)); + Assert.AreEqual(TimsTofMsMsType.PASEF, result); + + Assert.IsTrue(9.ToEnum(out result)); + Assert.AreEqual(TimsTofMsMsType.DIA, result); + + Assert.IsTrue(10.ToEnum(out result)); + Assert.AreEqual(TimsTofMsMsType.PRM, result); + + Assert.IsTrue(0.ToEnum(out var result2)); + Assert.AreEqual(TimsTofAcquisitionMode.MS, result2); + + Assert.IsFalse(1.ToEnum(out result)); + Assert.IsFalse(11.ToEnum(out result)); + Assert.IsFalse(7.ToEnum(out result)); + + } } } diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index 355d9d27c..c2e19749a 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -8,10 +8,12 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using MzLibUtil; using Omics; using Omics.Digestion; using Omics.Fragmentation; using Omics.Modifications; +using Transcriptomics.Digestion; using UsefulProteomicsDatabases; using Stopwatch = System.Diagnostics.Stopwatch; @@ -55,8 +57,22 @@ public static void 
TestDifferentProteaseEquals() Assert.That(pep1.Parent.Equals(pep2.Parent)); Assert.That(!pep1.DigestionParams.DigestionAgent.Equals(pep2.DigestionParams.DigestionAgent)); Assert.That(!pep1.Equals(pep2)); - // HashCode is only concerned with the full sequence, not the protease. Only the equals method is interested in the protease used - Assert.That(pep1.GetHashCode().Equals(pep2.GetHashCode())); + Assert.That(!pep1.Equals((object)pep2)); + Assert.That(!pep1.GetHashCode().Equals(pep2.GetHashCode())); + } + + [Test] + public static void TestPeptideOligoEquality() + { + var oligo = new OligoWithSetMods("GUACUG", []); + var peptide = new PeptideWithSetModifications("PEPTIDE", []); + + Assert.That(!oligo.Equals(peptide)); + Assert.That(!peptide.Equals(oligo)); + Assert.That(!((IBioPolymerWithSetMods)oligo).Equals(peptide)); + Assert.That(!((IBioPolymerWithSetMods)peptide).Equals(oligo)); + Assert.That(!((object)oligo).Equals(peptide)); + Assert.That(!((object)peptide).Equals(oligo)); } [Test] @@ -762,12 +778,11 @@ public static void TestReverseDecoyFromTarget() int[] newAminoAcidPositions = new int["PEPTIDEK".Length]; PeptideWithSetModifications reverse = p.GetReverseDecoyFromTarget(newAminoAcidPositions); - // Hash code corresponding to the target sequence, should be PairedTargetDecoyHash for reverse - int testTargetHash = p.GetHashCode(); - // Hash code corresponding to the decoy sequence, should be PairedTargetDecoyHash for target - int testDecoyHash = reverse.GetHashCode(); - Assert.AreEqual(reverse.PairedTargetDecoySequence.GetHashCode(), testTargetHash); - Assert.AreEqual(p.PairedTargetDecoySequence.GetHashCode(), testDecoyHash); + + string targetSequence = p.FullSequence; + string decoySequence = reverse.FullSequence; + Assert.AreEqual(reverse.PairedTargetDecoySequence, targetSequence); + Assert.AreEqual(p.PairedTargetDecoySequence, decoySequence); Assert.AreEqual("EDITPEPK", reverse.BaseSequence); Assert.AreEqual(new int[] { 6, 5, 4, 3, 2, 1, 0, 7 }, 
newAminoAcidPositions); Assert.IsTrue(reverse.Protein.IsDecoy); @@ -839,11 +854,11 @@ public static void TestReverseDecoyFromTarget() PeptideWithSetModifications p_tryp = new PeptideWithSetModifications(new Protein("VTIRTVR", "DECOY_TRYP"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, VTIRTVR_modsDictionary, 0, null); PeptideWithSetModifications p_tryp_reverse = p_tryp.GetReverseDecoyFromTarget(newAminoAcidPositions); // Hash code corresponding to the target sequence, should be PairedTargetDecoyHash for reverse - int testMirrorTargetHash = p_tryp.GetHashCode(); + string mirrorTarget = p_tryp.FullSequence; // Hash code corresponding to the decoy sequence, should be PairedTargetDecoyHash for target - int testMirrorDecoyHash = p_tryp_reverse.GetHashCode(); - Assert.AreEqual(testMirrorTargetHash, p_tryp_reverse.PairedTargetDecoySequence.GetHashCode()); - Assert.AreEqual(testMirrorDecoyHash, p_tryp.PairedTargetDecoySequence.GetHashCode()); + string mirrorDecoy = p_tryp_reverse.FullSequence; + Assert.AreEqual(mirrorTarget, p_tryp_reverse.PairedTargetDecoySequence); + Assert.AreEqual(mirrorDecoy, p_tryp.PairedTargetDecoySequence); Assert.AreEqual("RVTRITV", p_tryp_reverse.BaseSequence); Assert.AreEqual(new int[] { 6, 5, 4, 3, 2, 1, 0 }, newAminoAcidPositions); Assert.IsTrue(p_tryp_reverse.AllModsOneIsNterminus.ContainsKey(1));//n-term acetyl @@ -868,12 +883,11 @@ public static void TestScrambledDecoyFromTarget() PeptideWithSetModifications p = new PeptideWithSetModifications(new Protein("PEPTIDEK", "ACCESSIION"), new DigestionParams(), 1, 8, CleavageSpecificity.Full, null, 0, allmodsoneisnterminus, 0, null); int[] newAminoAcidPositions = new int["PEPTIDEK".Length]; PeptideWithSetModifications testScrambled = p.GetScrambledDecoyFromTarget(newAminoAcidPositions); - // Hash code corresponding to the target sequence, should be PairedTargetDecoyHash for reverse - int testTargetHash = p.GetHashCode(); - // Hash code corresponding to the 
decoy sequence, should be PairedTargetDecoyHash for target - int testDecoyHash = testScrambled.GetHashCode(); - Assert.AreEqual(testScrambled.PairedTargetDecoySequence.GetHashCode(), testTargetHash); - Assert.AreEqual(p.PairedTargetDecoySequence.GetHashCode(), testDecoyHash); + + string targetSequence = p.FullSequence; + string decoySequence = testScrambled.FullSequence; + Assert.AreEqual(testScrambled.PairedTargetDecoySequence, targetSequence); + Assert.AreEqual(p.PairedTargetDecoySequence, decoySequence); Assert.AreEqual("IDEETPPK", testScrambled.BaseSequence); Assert.AreEqual(new int[] { 4, 5, 6, 1, 3, 0, 2, 7 }, newAminoAcidPositions); // Check n-term acetyl @@ -1181,5 +1195,135 @@ public static void TestPeptideWithSetModsNoParentProtein() Assert.AreEqual('-', last.NextAminoAcid); Assert.AreEqual('-', last.NextResidue); } + + [Test] + public static void TestPeptideWithSetModsEquals() + { + // Create two proteins + Protein protein1 = new Protein("SEQUENCEK", "accession1"); + Protein protein2 = new Protein("SEQUENCEK", "accession2"); + + // Create digestion parameters + DigestionParams digestionParams = new DigestionParams(protease: "trypsin", maxMissedCleavages: 0, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + + // Digest the proteins to get peptides + PeptideWithSetModifications peptide1 = protein1.Digest(digestionParams, new List(), new List()).First(); + PeptideWithSetModifications peptide2 = protein2.Digest(digestionParams, new List(), new List()).First(); + + // Test equality - same peptide + Assert.IsTrue(peptide1.Equals(peptide1)); + + // different peptide + Assert.IsTrue(!peptide1.Equals(peptide2)); + Assert.IsTrue(!peptide1.Equals((object)peptide2)); + Assert.IsTrue(!peptide1.Equals((IBioPolymerWithSetMods)peptide2)); + Assert.AreNotEqual(peptide1.GetHashCode(), peptide2.GetHashCode()); + + // Test inequality with different start residue + PeptideWithSetModifications peptide3 = new PeptideWithSetModifications(protein1, 
digestionParams, 2, 9, CleavageSpecificity.Full, "", 0, new Dictionary(), 0); + Assert.IsFalse(peptide1.Equals(peptide3)); + + // Test inequality with different parent accession + PeptideWithSetModifications peptide4 = new PeptideWithSetModifications(protein2, digestionParams, 1, 9, CleavageSpecificity.Full, "", 0, new Dictionary(), 0); + Assert.IsFalse(peptide1.Equals(peptide4)); + + // all fail on null + Assert.That(!peptide1.Equals(null)); + Assert.That(!peptide1.Equals((object)null)); + Assert.That(!peptide1.Equals((PeptideWithSetModifications)null)); + } + + + + [Test] + public static void TestIBioPolymerWithSetModsModificationFromFullSequence() + { + Dictionary un = new Dictionary(); + var psiModDeserialized = Loaders.LoadPsiMod(Path.Combine(TestContext.CurrentContext.TestDirectory, "PSI-MOD.obo2.xml")); + Dictionary formalChargesDictionary = Loaders.GetFormalChargesDictionary(psiModDeserialized); + List UniProtPtms = Loaders.LoadUniprot(Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist2.txt"), + formalChargesDictionary).ToList(); + List proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "cRAP_databaseGPTMD.xml"), + true, DecoyType.None, UniProtPtms, false, new string[] { "exclude_me" }, out un); + var allKnownModDict = UniProtPtms.ToDictionary(p => p.IdWithMotif, p => p); + var digestionParameters = new DigestionParams(maxModsForPeptides: 3); + + foreach (Protein p in proteins) + { + List digestedPeptides = + p.Digest(digestionParameters, [], [], null, null).ToList(); + // take the most modified peptide by base sequence and ensure all methods function properly + foreach (var targetPeptide in digestedPeptides + .Where(pep => pep.FullSequence.Contains('[')) + .GroupBy(pep => pep.BaseSequence) + .Select(pepGroup => pepGroup.MaxBy(pep => pep.AllModsOneIsNterminus.Count))) + { + var startResidue = targetPeptide.OneBasedStartResidue; + var endResidue = targetPeptide.OneBasedEndResidue; + 
+ // Pull our expected modifications based upon parent protein object with a maximum value of DigestionParameters.MaxMods + // A bunch of logic to count the number of expected modifications based upon the xml database entries + int expectedModCount = 0; + foreach (var modDictEntry in p.OneBasedPossibleLocalizedModifications + .Where(mod => mod.Key >= startResidue && mod.Key <= endResidue)) + { + if (modDictEntry.Value.Count > 1) + { + var locRestrictions = modDictEntry.Value.Select(mod => mod.LocationRestriction).ToList(); + + if (locRestrictions.AllSame()) + { + if (locRestrictions.First() == "Anywhere.") + expectedModCount++; + else if (locRestrictions.First() == "N-terminal." && modDictEntry.Key == startResidue) + expectedModCount++; + } + else if (modDictEntry.Value.Select(mod => mod.LocationRestriction).Contains("Anywhere.") + && modDictEntry.Value.Select(mod => mod.LocationRestriction) + .Contains("N-terminal.")) + { + expectedModCount++; + if (modDictEntry.Key == startResidue) + expectedModCount++; + } + } + else + { + switch (modDictEntry.Value.First().LocationRestriction) + { + case "Anywhere.": + case "N-terminal." 
when modDictEntry.Key == startResidue: + expectedModCount++; + break; + } + } + } + + expectedModCount = Math.Min(expectedModCount, digestionParameters.MaxMods); + + var expectedModifications = p.OneBasedPossibleLocalizedModifications.Where(mod => + mod.Key >= startResidue && + mod.Key <= endResidue).SelectMany(mod => mod.Value).ToList(); + + // Parse modifications from PWSM and two IBioPolymerWithSetMods methods + var pwsmModDict = targetPeptide.AllModsOneIsNterminus; + var bpwsmModDict = IBioPolymerWithSetMods.GetModificationDictionaryFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + var bpwsmModList = IBioPolymerWithSetMods.GetModificationsFromFullSequence(targetPeptide.FullSequence, allKnownModDict); + + // Ensure all methods are in agreement by modification count + Assert.AreEqual(pwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModDict.Count, expectedModCount); + Assert.AreEqual(bpwsmModList.Count, expectedModCount); + + // Ensure all methods are in agreement by modification identify + foreach (var pwsmModification in pwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModDict.Values) + Assert.Contains(pwsmModification, expectedModifications); + foreach (var pwsmModification in bpwsmModList) + Assert.Contains(pwsmModification, expectedModifications); + } + } + } } } \ No newline at end of file diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index 02cc3aed5..bd8b3f36b 100644 --- a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -224,39 +224,98 @@ public static void TestPeptideWithSetModifications() Assert.AreEqual("[H]M[H][H]", ye.Last().SequenceWithChemicalFormulas); double m1 = 5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass; - m1 = Math.Round(m1, 9, MidpointRounding.AwayFromZero); - double m2 = 
ye.Last().MonoisotopicMass; double m3 = m1 - m2; - Assert.IsTrue(m3 < 1e-9); } [Test] - public static void TestPeptideWithFixedModifications() + public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepMods() { var prot = new Protein("M", null); DigestionParams digestionParams = new DigestionParams(maxMissedCleavages: 0, minPeptideLength: 1, maxModsForPeptides: 3); // if you pass Custom Protease7 this test gets really flakey. List fixedMods = new List(); ModificationMotif.TryGetMotif("M", out ModificationMotif motif); - fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: motif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); fixedMods.Add(new Modification(_originalId: "pepNmod", _target: motif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); fixedMods.Add(new Modification(_originalId: "resMod", _target: motif, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - fixedMods.Add(new Modification(_originalId: "PepCmod", _target: motif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); fixedMods.Add(new Modification(_originalId: "ProtCmod", _target: motif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); - + fixedMods.Add(new Modification(_originalId: "PepCmod", _target: motif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); var ok = prot.Digest(digestionParams, fixedMods, new List()).ToList(); 
Assert.AreEqual(1, ok.Count); - Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]", ok.First().FullSequence); Assert.AreEqual("[H]M[H][H]", ok.First().SequenceWithChemicalFormulas); Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); } + [Test] + public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepMods_RandomizedModOrder() + { + var rand = new Random(42); + var prot = new Protein("M", null); + DigestionParams digestionParams = new DigestionParams(maxMissedCleavages: 0, minPeptideLength: 1, maxModsForPeptides: 3); // if you pass Custom Protease7 this test gets really flakey. + List fixedMods = new List(); + ModificationMotif.TryGetMotif("M", out ModificationMotif motif); + fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: motif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "pepNmod", _target: motif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "resMod", _target: motif, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "ProtCmod", _target: motif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "PepCmod", _target: motif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: 
ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + + // set expected values + int expectedDigestionProducts = 1; + string expectedFullSequence = "[:ProtNmod on M]M[:resMod on M][:ProtCmod on M]"; + string expectedSequenceWithChemicalFormulas = "[H]M[H][H]"; + double expectedMonoisotopicMass = 5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass; + + // randomly scramble all mods, digest, and ensure the answer is correct. + for (int i = 0; i < 10; i++) + { + var shuffledFixedMods = fixedMods.OrderBy(a => rand.Next()).ToList(); + var ok = prot.Digest(digestionParams, shuffledFixedMods, new List()).ToList(); + + Assert.AreEqual(expectedDigestionProducts, ok.Count); + Assert.AreEqual(expectedFullSequence, ok.First().FullSequence); + Assert.AreEqual(expectedSequenceWithChemicalFormulas, ok.First().SequenceWithChemicalFormulas); + Assert.AreEqual(expectedMonoisotopicMass, ok.Last().MonoisotopicMass, 1e-9); + } + } + + [Test] + public static void TestPeptideDigestion_FixedModifications_ProtModsOverwritePepMods_TwoProducts() + { + var prot = new Protein("MKM", null); + DigestionParams digestionParams = new DigestionParams(maxMissedCleavages: 0, minPeptideLength: 1, maxModsForPeptides: 3, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + List fixedMods = new List(); + ModificationMotif.TryGetMotif("M", out ModificationMotif mMotif); + ModificationMotif.TryGetMotif("K", out ModificationMotif kMotif); + + fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: mMotif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "ProtNmod", _target: kMotif, _locationRestriction: "N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: 
GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "pepNmod", _target: mMotif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "pepNmod", _target: kMotif, _locationRestriction: "Peptide N-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "resMod", _target: mMotif, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "ProtCmod", _target: mMotif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "ProtCmod", _target: kMotif, _locationRestriction: "C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "PepCmod", _target: mMotif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + fixedMods.Add(new Modification(_originalId: "PepCmod", _target: kMotif, _locationRestriction: "Peptide C-terminal.", _chemicalFormula: ChemicalFormula.ParseFormula("H"), _monoisotopicMass: GetElement(1).PrincipalIsotope.AtomicMass)); + + var ok = prot.Digest(digestionParams, fixedMods, new List()).ToList(); + + Assert.AreEqual(2, ok.Count); + + Assert.AreEqual("[:ProtNmod on M]M[:resMod on M]K[:PepCmod on K]", ok.First().FullSequence); + Assert.AreEqual("[:pepNmod on M]M[:resMod on M][:ProtCmod on M]", ok.Skip(1).First().FullSequence); + + 
Assert.AreEqual("[H]M[H]K[H]", ok.First().SequenceWithChemicalFormulas); + Assert.AreEqual("[H]M[H][H]", ok.Skip(1).First().SequenceWithChemicalFormulas); + Assert.AreEqual(5 * GetElement("H").PrincipalIsotope.AtomicMass + Residue.ResidueMonoisotopicMass['M'] + GetElement("O").PrincipalIsotope.AtomicMass, ok.Last().MonoisotopicMass, 1e-9); + } + [Test] public static void TestDigestIndices() { @@ -361,6 +420,40 @@ public static void Test_ProteinDigest() Assert.AreEqual("MED[mt:mod1 on D]EEK", pep2.FullSequence); } + /// + /// We want to have protein digestion yield the same set of peptides regardless of the order their modifications are encoded in the XML. + /// While all of the positions of the modifications are the same, the order of the modifications in the XML is different. + /// + [Test] + public static void TestDigestionOfSameProteinFromDifferentXmls() + { + DigestionParams digestionParams = new DigestionParams("trypsin", maxMissedCleavages: 2, minPeptideLength: 7, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + ModificationMotif.TryGetMotif("C", out ModificationMotif motif); + Modification carbamidomethylOnC = new Modification(_originalId: "Carbamidomethyl on C", _modificationType: "Common Fixed", _target: motif, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3NO")); + var fixedModifications = new List { carbamidomethylOnC }; + ModificationMotif.TryGetMotif("M", out ModificationMotif motifM); + Modification oxidationOnM = new Modification(_originalId: "Oxidation on M", _modificationType: "Common Variable", _target: motifM, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("O")); + var variableModifications = new List { oxidationOnM }; + + // Load in proteins + var dbFive = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SingleEntry_ModOrder1.xml"); + var dbSix = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", 
"SingleEntry_ModOrder2.xml"); + + var proteins5 = ProteinDbLoader.LoadProteinXML(dbFive, true, DecoyType.None, null, false, null, out var unknownModificationsFive); + var proteins6 = ProteinDbLoader.LoadProteinXML(dbSix, true, DecoyType.None, null, false, null, out var unknownModificationsSix); + + var fiveMods = ProteinDbLoader.GetPtmListFromProteinXml(dbFive); + var sixMods = ProteinDbLoader.GetPtmListFromProteinXml(dbSix); + + Assert.AreEqual(fiveMods.Count, sixMods.Count); + CollectionAssert.AreEquivalent(fiveMods, sixMods); + + var peptides5 = proteins5.First().Digest(digestionParams, fixedModifications, variableModifications).ToList(); + var peptides6 = proteins6.First().Digest(digestionParams, fixedModifications, variableModifications).ToList(); + Assert.AreEqual(peptides5.Count, peptides6.Count); + CollectionAssert.AreEqual(peptides5, peptides6); + } + [Test] [TestCase("cRAP_databaseGPTMD.xml")] [TestCase("uniprot_aifm1.fasta")] diff --git a/mzLib/Test/TestProteinProperties.cs b/mzLib/Test/TestProteinProperties.cs index 6c8d866dd..ce669e59a 100644 --- a/mzLib/Test/TestProteinProperties.cs +++ b/mzLib/Test/TestProteinProperties.cs @@ -5,8 +5,10 @@ using System.Collections.Generic; using System.IO; using System.Linq; +using Omics; using Omics.Modifications; using Stopwatch = System.Diagnostics.Stopwatch; +using Transcriptomics; namespace Test { @@ -261,5 +263,43 @@ public static void TestProteoformClassification()//string inputPath) ///Test case 3 is 2B (not level 3) because you've localized the mod, you just aren't sure what mod it is. ///In test case 1, you know what the mods are, but you're not sure where they belong. 
} + + [Test] + public void TestProteinEquals() + { + string sequence = "MKWVTFISLLFLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNEVTEFAKTCVADESAENCDKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLFFAKRYKAAFTECCQAADKAACLLPKLDELRDEGKASSAKQR"; + string accession = "P02768"; + Protein protein1 = new Protein(sequence, accession); + Protein protein2 = new Protein(sequence, accession); + + NUnit.Framework.Assert.That(protein1.Equals(protein2), Is.True); + NUnit.Framework.Assert.That(protein1.Equals((object)protein2), Is.True); + NUnit.Framework.Assert.That(protein1.Equals(null), Is.False); + } + + [Test] + public void TestProteinGetHashCode() + { + string sequence = "MKWVTFISLLFLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNEVTEFAKTCVADESAENCDKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLFFAKRYKAAFTECCQAADKAACLLPKLDELRDEGKASSAKQR"; + string accession = "P02768"; + Protein protein = new Protein(sequence, accession); + + NUnit.Framework.Assert.That(protein.GetHashCode(), Is.EqualTo(sequence.GetHashCode())); + } + + [Test] + public void TestProteinRnaEquality() + { + string sequence = "MKWVTFISLLFLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGLVLIAFSQYLQQCPFDEHVKLVNEVTEFAKTCVADESAENCDKSLHTLFGDELCKVASLRETYGDMADCCEKQEPERNECFLSHKDDSPDLPKLKPDPNTLCDEFKADEKKFWGKYLYEIARRHPYFYAPELLFFAKRYKAAFTECCQAADKAACLLPKLDELRDEGKASSAKQR"; + string accession = "P02768"; + Protein protein1 = new Protein(sequence, accession); + RNA rna = new RNA("GUACUG"); + + + Assert.That(!rna.Equals(protein1)); + Assert.That(!protein1.Equals(rna)); + Assert.That(!((IBioPolymer)rna).Equals(protein1)); + Assert.That(!((IBioPolymer)protein1).Equals(rna)); + } } } \ No newline at end of file diff --git a/mzLib/Test/TestSpectra.cs b/mzLib/Test/TestSpectra.cs index e33d9adb1..22c4b3224 100644 --- a/mzLib/Test/TestSpectra.cs +++ b/mzLib/Test/TestSpectra.cs @@ -22,6 +22,7 @@ using System; using System.Collections.Generic; 
using System.Linq; +using Chemistry; using Assert = NUnit.Framework.Legacy.ClassicAssert; using Stopwatch = System.Diagnostics.Stopwatch; @@ -342,5 +343,152 @@ public void TestEqualsAndHashCode() Assert.That(!_mzSpectrumA.Equals(2)); Assert.That(!_mzSpectrumA.Equals((object)2)); } + + #region Neutral Mass Spectrum + + [Test] + public void NeutralMassSpectrum_Constructor_ValidArguments_InitializesProperties() + { + double[] monoisotopicMasses = { 100.0, 200.0, 300.0 }; + double[] intensities = { 0.5, 0.8, 1.0 }; + int[] charges = { 1, 2, 3 }; + + var spectrum = new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, true); + + Assert.That(monoisotopicMasses.Length, Is.EqualTo(spectrum.XArray.Length)); + Assert.That(intensities.Length, Is.EqualTo(spectrum.YArray.Length)); + Assert.That(charges.Length, Is.EqualTo(spectrum.Charges.Length)); + } + + [Test] + public void NeutralMassSpectrum_Constructor_InvalidArguments_ThrowsArgumentException() + { + double[] monoisotopicMasses = { 100.0, 200.0, 300.0 }; + double[] intensities = { 0.5, 0.8 }; + int[] charges = { 1, 2, 3 }; + bool shouldCopy = true; + + Assert.Throws(() => new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, shouldCopy)); + } + + [Test] + public void NeutralMassSpectrum_MzPeak() + { + double[] monoisotopicMasses = { 100.0, 200.0, 300.0 }; + double[] intensities = { 0.5, 0.8, 1.0 }; + int[] charges = { 1, 2, 3 }; + var spectrum = new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, true); + + + var peak = spectrum.Extract(50, 210).ToArray(); + Assert.That(peak.Length, Is.EqualTo(2)); + + for (int i = 0; i < peak.Length; i++) + { + double mono = monoisotopicMasses[i]; + int charge = charges[i]; + double intensity = intensities[i]; + double mz = mono.ToMz(charge); + + Assert.That(peak[i].Mz, Is.EqualTo(mz)); + Assert.That(peak[i].Intensity, Is.EqualTo(intensity)); + } + } + + [Test] + public void NeutralMassSpectrum_MzRange() + { + double[] monoisotopicMasses = { 100.0, 
200.0, 300.0 }; + double[] intensities = { 0.5, 0.8, 1.0 }; + int[] charges = { 1, 2, 3 }; + var spectrum = new NeutralMassSpectrum(monoisotopicMasses, intensities, charges, true); + + + var peak = spectrum.Extract(50, 2100).ToArray(); + Assert.That(peak.Length, Is.EqualTo(3)); + var minPeak = peak.MinBy(p => p.Mz); + var maxPeak = peak.MaxBy(p => p.Mz); + + Assert.That(minPeak.Mz, Is.EqualTo(spectrum.Range.Minimum)); + Assert.That(minPeak.Mz, Is.EqualTo(spectrum.FirstX)); + Assert.That(maxPeak.Mz, Is.EqualTo(spectrum.Range.Maximum)); + Assert.That(maxPeak.Mz, Is.EqualTo(spectrum.LastX)); + + for (int i = 0; i < peak.Length; i++) + { + double mono = monoisotopicMasses[i]; + int charge = charges[i]; + double intensity = intensities[i]; + double mz = mono.ToMz(charge); + + Assert.That(peak[i].Mz, Is.EqualTo(mz)); + Assert.That(peak[i].Intensity, Is.EqualTo(intensity)); + } + } + + [Test] + public void NeutralMassSpectrum_Constructor_ValidArguments_InitializesCharges() + { + // Arrange + double[,] monoisotopicMassesIntensities = new double[,] { { 100.0, 200.0 }, { 300.0, 400.0 } }; + int[] charges = new int[] { 1, 2 }; + + // Act + var spectrum = new NeutralMassSpectrum(monoisotopicMassesIntensities, charges); + + // Assert + Assert.AreEqual(charges, spectrum.Charges); + } + + [Test] + public void NeutralMassSpectrum_Constructor2_InvalidArguments_ThrowsArgumentException() + { + // Arrange + double[,] monoisotopicMassesIntensities = new double[,] { { 100.0, 200.0 }, { 300.0, 400.0 } }; + int[] charges = new int[] { 1, 2, 3 }; + + // Act & Assert + Assert.Throws(() => new NeutralMassSpectrum(monoisotopicMassesIntensities, charges)); + } + + #endregion + + [TestCase(50.0, new int[] { })] // Case: Nearest index is zero + [TestCase(101.0, new int[] { 1 })] // Case: Nearest index is within tolerance + [TestCase(102.0, new int[] { 1, 2 })] // Case: Upper and lower bounds within tolerance + [TestCase(104.0, new int[] { 3, 4, 2 })] // Case: Upper and lower bounds within 
tolerance + [TestCase(105.0, new int[] { 4, 5, 3 })] // Case: Upper and lower bounds within tolerance + [TestCase(106.0, new int[] { 5, 4 })] // Case: Upper and lower bounds with stopping conditions + [TestCase(107.0, new int[] { 5 })] // Case: Upper outside tolerance + [TestCase(200.0, new int[] { })] // Case: Nearest index is outside range + public void TestGetPeakIndicesWithinTolerance(double x, int[] expectedIndices) + { + // Arrange + var xArray = new [] { 99.0, 101.0, 103.0, 104.0, 105.0, 106.0 }; + var tolerance = new AbsoluteTolerance(1); + var testObject = new MzSpectrum(xArray, xArray, false); + + // Act + List result = testObject.GetPeakIndicesWithinTolerance(x, tolerance); + + // Assert + NUnit.Framework.Assert.That(result, Is.EquivalentTo(expectedIndices)); + } + + [Test] + public void TestGetPeakIndicesWithinTolerance_HandlesEmptyXArray_Gracefully() + { + // Arrange + var empty = new double[] { }; // Empty array + var tolerance = new PpmTolerance(10); + var testObject = new MzSpectrum(empty, empty, false); + double x = 103.0; + + // Act + List result = testObject.GetPeakIndicesWithinTolerance(x, tolerance); + + // Assert + NUnit.Framework.Assert.That(result, Is.Empty); + } } } \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.fasta b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta new file mode 100644 index 000000000..c222589c1 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta @@ -0,0 +1,2 @@ +>id:2|Name:20mer1|SOterm:20mer1|Type:tRNA|Subtype:Ala|Feature:VGC|Cellular_Localization:freezer|Species:standard +GUACUGCCUCUAGUGAAGCA \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz new file mode 100644 index 000000000..2fe54f9ab Binary files /dev/null and b/mzLib/Test/Transcriptomics/TestData/20mer1.fasta.gz differ diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.xml 
b/mzLib/Test/Transcriptomics/TestData/20mer1.xml new file mode 100644 index 000000000..6f17d6f3d --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/20mer1.xml @@ -0,0 +1,17 @@ + + + + 20mer1 + 20mer1 + + + 20mer1 + + + + + standard + + GUACUGCCUCUAGUGAAGCA + + \ No newline at end of file diff --git a/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz b/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz new file mode 100644 index 000000000..19dac16bf Binary files /dev/null and b/mzLib/Test/Transcriptomics/TestData/20mer1.xml.gz differ diff --git a/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta new file mode 100644 index 000000000..18802a82a --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta @@ -0,0 +1,10 @@ +>id:1|Name:tdbR00000010|SOterm:SO:0000254|Type:tRNA|Subtype:Ala|Feature:VGC|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA +>id:2|Name:tdbR00000008|SOterm:SO:0000254|Type:tRNA|Subtype:Ala|Feature:GGC|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GGGGCUAUAGCUCAGCUGGGAGAGCGCUUGCAUGGCAUGCAAGAGGUCAGCGGUUCGAUCCCGCUUAGCUCCACCA +>id:3|Name:tdbR00000356|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:ICG|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCAUCCGUAGCUCAGCUGGAUAGAGUACUCGGCUACGAACCGAGCGGUCGGAGGUUCGAAUCCUCCCGGAUGCACCA +>id:4|Name:tdbR00000359|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:CCG|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCGCCCGUAGCUCAGCUGGAUAGAGCGCUGCCCUCCGGAGGCAGAGGUCUCAGGUUCGAAUCCUGUCGGGCGCGCCA +>id:5|Name:tdbR00000358|SOterm:SO:0001036|Type:tRNA|Subtype:Arg|Feature:UCU|Cellular_Localization:prokaryotic cytosol|Species:Escherichia coli +GCGCCCUUAGCUCAGUUGGAUAGAGCAACGACCUUCUAAGUCGUGGGCCGCAGGUUCGAAUCCUGCAGGGCGCGCCA diff --git 
a/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz new file mode 100644 index 000000000..11ab87ef2 Binary files /dev/null and b/mzLib/Test/Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta.gz differ diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs new file mode 100644 index 000000000..e1ef6af90 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs @@ -0,0 +1,171 @@ +using NUnit.Framework; +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using UsefulProteomicsDatabases.Transcriptomics; +using UsefulProteomicsDatabases; +using Transcriptomics; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestDbLoader + { + public static string ModomicsUnmodifedFastaPath => Path.Combine(TestContext.CurrentContext.TestDirectory, + "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta"); + + /// + /// Detect the headertype of the test cases + /// + private static IEnumerable<(string, RnaFastaHeaderType)> DetectHeaderTestCases => + new List<(string, RnaFastaHeaderType)> + { + (Path.Combine(TestContext.CurrentContext.TestDirectory, "DoubleProtease.tsv"), RnaFastaHeaderType.Unknown), + (ModomicsUnmodifedFastaPath, RnaFastaHeaderType.Modomics), + (Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.fasta"), RnaFastaHeaderType.Modomics), + + }; + + /// + /// Test the correctness of checking headertype + /// + /// + [Test] + [TestCaseSource(nameof(DetectHeaderTestCases))] + public static void TestDetectHeaderType((string dbPath, RnaFastaHeaderType headerType) testData) + { + string line = File.ReadLines(testData.dbPath).First(); + if (char.IsDigit(line.First())) + { 
+ line = File.ReadLines(testData.dbPath).Skip(1).First(); + } + var type = RnaDbLoader.DetectRnaFastaHeaderType(line); + Assert.That(testData.headerType, Is.EqualTo(type)); + } + + + [Test] + [TestCase("ModomicsUnmodifiedTrimmed.fasta")] + [TestCase("ModomicsUnmodifiedTrimmed.fasta.gz")] + public static void TestModomicsUnmodifiedFasta(string databaseFileName) + { + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", + databaseFileName); + var oligos = RnaDbLoader.LoadRnaFasta(dbPath, true, DecoyType.None, false, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(5)); + Assert.That(oligos.First().BaseSequence, + Is.EqualTo("GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA")); + Assert.That(oligos.First().Name, Is.EqualTo("tdbR00000010")); + Assert.That(oligos.First().Accession, Is.EqualTo("SO:0000254")); + Assert.That(oligos.First().Organism, Is.EqualTo("Escherichia coli")); + Assert.That(oligos.First().DatabaseFilePath, Is.EqualTo(dbPath)); + Assert.That(oligos.First().IsContaminant, Is.False); + Assert.That(oligos.First().IsDecoy, Is.False); + Assert.That(oligos.First().AdditionalDatabaseFields!.Count, Is.EqualTo(5)); + Assert.That(oligos.First().AdditionalDatabaseFields!["Id"], Is.EqualTo("1")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Type"], Is.EqualTo("tRNA")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Subtype"], Is.EqualTo("Ala")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Feature"], Is.EqualTo("VGC")); + Assert.That(oligos.First().AdditionalDatabaseFields!["Cellular Localization"], Is.EqualTo("prokaryotic cytosol")); + } + + [Test] + public static void TestContaminantFollowsThrough() + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, true, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(5)); + 
Assert.That(oligos.First().BaseSequence, + Is.EqualTo("GGGGCUAUAGCUCAGCUGGGAGAGCGCCUGCUUUGCACGCAGGAGGUCUGCGGUUCGAUCCCGCAUAGCUCCACCA")); + Assert.That(oligos.All(p => p.IsContaminant)); + Assert.That(oligos.All(p => !p.IsDecoy)); + } + + [Test] + public static void TestNotGeneratingTargetsOrDecoys() + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, false, DecoyType.None, true, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(0)); + } + + [Test] + public static void TestXmlWriterReader() + { + var rna = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, false, out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + + var modString = "ID Methylation\r\nMT Biological\r\nPP Anywhere.\r\nTG G\r\nCF C1H2\r\n" + @"//"; + var methylG = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).First(); + + Dictionary>> mods = new Dictionary>>(); + mods.Add("SO:0000254", new HashSet>() + { + new Tuple(1, methylG), + new Tuple(3, methylG) + }); + + string outpath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics/TestData/ModomicsUnmodifiedTrimmed.xml"); + + var xml = ProteinDbWriter.WriteXmlDatabase(mods, rna, outpath); + var temp = RnaDbLoader.LoadRnaXML(outpath, true, DecoyType.None, false, + new List() { methylG }, new List(), out var unknownMods); + + Assert.That(unknownMods.Count, Is.EqualTo(0)); + Assert.That(temp.Count, Is.EqualTo(5)); + var first = temp.First(); + var loadedMods = first.OneBasedPossibleLocalizedModifications; + Assert.That(loadedMods.Count, Is.EqualTo(2)); + Assert.That(loadedMods[1].Count, Is.EqualTo(1)); + Assert.That(loadedMods[3].Count, Is.EqualTo(1)); + Assert.That(loadedMods[1].First().IdWithMotif, Is.EqualTo(methylG.IdWithMotif)); + Assert.That(loadedMods[3].First().IdWithMotif, Is.EqualTo(methylG.IdWithMotif)); + } + + [Test] + [TestCase("ATCG", "AUCG", true)] + [TestCase("ATCG", "UAGC", 
false)] + [TestCase("ATCGZ", "AUCGZ", true)] + [TestCase("ATCGZ", "UAGCZ", false)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACT", "AUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACU", true)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACT", "UAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGA", false)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACT", "AUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACUAUCGACGAAUCACGAUCAGUCAUGCAUUGCUAACU", true)] + [TestCase("ATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACTATCGACGAATCACGATCAGTCATGCATTGCTAACT", "UAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGAUAGCUGCUUAGUGCUAGUCAGUACGUAACGAUUGA", false)] + public static void TestTranscribe(string input, string expected, bool isCodingStrand) + { + Assert.That(input.Transcribe(isCodingStrand), Is.EqualTo(expected)); + } + + [Test] + [TestCase("20mer1.fasta")] + [TestCase("20mer1.fasta.gz")] + [TestCase("20mer1.xml")] + [TestCase("20mer1.xml.gz")] + public static void TestDbReadingDifferentExtensions(string databaseFileName) + { + var dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", + databaseFileName); + + List rna; + if (dbPath.Contains("fasta")) + rna = RnaDbLoader.LoadRnaFasta(dbPath, true, DecoyType.None, false, + out var errors); + else + rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.None, false, + new List(), new List(), out _); + + Assert.That(rna.Count, Is.EqualTo(1)); + 
Assert.That(rna.First().BaseSequence, Is.EqualTo("GUACUGCCUCUAGUGAAGCA")); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs new file mode 100644 index 000000000..acf6cbff9 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDecoyGenerator.cs @@ -0,0 +1,280 @@ +using NUnit.Framework; +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework.Interfaces; +using Transcriptomics; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases.Transcriptomics; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + internal class TestDecoyGeneration + { + public static string ModomicsUnmodifiedFastaPath => TestDbLoader.ModomicsUnmodifedFastaPath; + + [Test] + public static void TestReverseDecoy_Simple() + { + var oligos = new List() + { + new RNA("GUUCUG"), + new RNA("GUGCUA"), + }; + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Reverse, 1); + Assert.That(decoys.Count, Is.EqualTo(2)); + Assert.That(decoys[0].BaseSequence, Is.EqualTo("UCUUGG")); + Assert.That(decoys[1].BaseSequence, Is.EqualTo("UCGUGA")); + + var example = oligos.First(); + Assert.That(decoys.All(p => !p.IsContaminant)); + Assert.That(decoys.All(p => p.IsDecoy)); + Assert.That(decoys.All(p => p.DatabaseFilePath == example.DatabaseFilePath)); + Assert.That(decoys.All(p => p.Organism == example.Organism)); + Assert.That(decoys.All(p => p.AdditionalDatabaseFields == example.AdditionalDatabaseFields)); + Assert.That(decoys.All(p => p.Accession == example.Accession)); + Assert.That(decoys.All(p => p.Name == example.Name)); + Assert.That(decoys.All(p => p.Length == example.Length)); + Assert.That(decoys.All(p => Equals(p.FivePrimeTerminus, example.FivePrimeTerminus))); + Assert.That(decoys.All(p => 
Equals(p.ThreePrimeTerminus, example.ThreePrimeTerminus))); + Assert.That(decoys.All(p => p.OneBasedPossibleLocalizedModifications.Count == example.OneBasedPossibleLocalizedModifications.Count)); + } + + [Test] + [TestCase("GUACUG", 1, "UCAUGG", 5)] + [TestCase("GUACUA", 2, "UCAUGA", 4)] + [TestCase("GUACUA", 3, "UCAUGA", 3)] + [TestCase("GUACUA", 4, "UCAUGA", 2)] + [TestCase("GUCCAA", 5, "ACCUGA", 1)] + [TestCase("GUUCUA", 6, "UCUUGA", 6)] + public static void TestReverseDecoy_SimpleWithMods(string rnaSequence, int modPosition, string expectedDecoySequence, int expectedDecoyModPosition) + { + var mod = new Modification(); + var oligos = new List() + { + new RNA(rnaSequence, null, null, + new Dictionary>() + { { modPosition, new List() { mod } } }), + }; + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Reverse, 1); + Assert.That(decoys.Count, Is.EqualTo(1)); + + var decoy = decoys.First(); + var originalRna = oligos.First(); + Assert.That(decoy.BaseSequence, Is.EqualTo(expectedDecoySequence)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(1)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Key, Is.EqualTo(expectedDecoyModPosition)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Value.Count, Is.EqualTo(1)); + Assert.That(decoy.OneBasedPossibleLocalizedModifications.First().Value.First(), Is.EqualTo(mod)); + Assert.That(decoy.Name, Is.EqualTo(originalRna.Name)); + Assert.That(decoy.Accession, Is.EqualTo(originalRna.Accession)); + Assert.That(decoy.Organism, Is.EqualTo(originalRna.Organism)); + Assert.That(decoy.DatabaseFilePath, Is.EqualTo(originalRna.DatabaseFilePath)); + Assert.That(decoy.IsContaminant, Is.EqualTo(originalRna.IsContaminant)); + Assert.That(decoy.IsDecoy, Is.True); + Assert.That(decoy.AdditionalDatabaseFields, Is.EqualTo(originalRna.AdditionalDatabaseFields)); + Assert.That(decoy.FivePrimeTerminus, Is.EqualTo(originalRna.FivePrimeTerminus)); + 
Assert.That(decoy.ThreePrimeTerminus, Is.EqualTo(originalRna.ThreePrimeTerminus)); + } + + [Test] + public void TestReverseDecoy_FromDatabase() + { + int numSequences = 5; + Dictionary expectedSequences = new Dictionary() + { + { "tdbR00000010", "CCACCUCGAUACGCCCUAGCUUGGCGUCUGGAGGACGCACGUUUCGUCCGCGAGAGGGUCGACUCGAUAUCGGGGA"}, + { "tdbR00000008", "CCACCUCGAUUCGCCCUAGCUUGGCGACUGGAGAACGUACGGUACGUUCGCGAGAGGGUCGACUCGAUAUCGGGGA"}, + { "tdbR00000356", "CCACGUAGGCCCUCCUAAGCUUGGAGGCUGGCGAGCCAAGCAUCGGCUCAUGAGAUAGGUCGACUCGAUGCCUACGA"}, + { "tdbR00000359", "CCGCGCGGGCUGUCCUAAGCUUGGACUCUGGAGACGGAGGCCUCCCGUCGCGAGAUAGGUCGACUCGAUGCCCGCGA"}, + { "tdbR00000358", "CCGCGCGGGACGUCCUAAGCUUGGACGCCGGGUGCUGAAUCUUCCAGCAACGAGAUAGGUUGACUCGAUUCCCGCGA"}, + }; + + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Reverse, false, + out var errors); + Assert.That(errors.Count, Is.EqualTo(0)); + Assert.That(oligos.Count, Is.EqualTo(numSequences * 2)); + Assert.That(oligos.Count(p => p.IsDecoy), Is.EqualTo(numSequences)); + Assert.That(oligos.Count(p => !p.IsDecoy), Is.EqualTo(numSequences)); + + foreach (var targetDecoyGroup in oligos.GroupBy(p => p.Name)) + { + Assert.That(targetDecoyGroup.Count(), Is.EqualTo(2)); + var target = targetDecoyGroup.First(p => !p.IsDecoy); + var decoy = targetDecoyGroup.First(p => p.IsDecoy); + var expectedSequence = expectedSequences[target.Name]; + + Assert.That(target.FivePrimeTerminus, Is.EqualTo(decoy.FivePrimeTerminus)); + Assert.That(target.ThreePrimeTerminus, Is.EqualTo(decoy.ThreePrimeTerminus)); + Assert.That(target.AdditionalDatabaseFields, Is.EqualTo(decoy.AdditionalDatabaseFields)); + Assert.That(target.IsContaminant, Is.EqualTo(decoy.IsContaminant)); + Assert.That(target.DatabaseFilePath, Is.EqualTo(decoy.DatabaseFilePath)); + Assert.That(target.DatabaseFilePath, Is.EqualTo(ModomicsUnmodifiedFastaPath)); + Assert.That(target.Organism, Is.EqualTo(decoy.Organism)); + Assert.That(target.Accession, 
Is.EqualTo(decoy.Accession)); + Assert.That(target.Name, Is.EqualTo(decoy.Name)); + Assert.That(target.Length, Is.EqualTo(decoy.Length)); + Assert.That(target.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(decoy.OneBasedPossibleLocalizedModifications.Count)); + + Assert.That(decoy.BaseSequence, Is.EqualTo(expectedSequence)); + } + } + + + // TODO: Implement these tests once other decoy generation methods are available + + [Test] + public void TestShuffledDecoy_Simple() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + Assert.Throws(() => + { + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + }); + + + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public void TestShuffledDecoy_SimpleWithMods() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + Assert.Throws(() => + { + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + }); + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Shuffle); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public void TestShuffledDecoy_FromDatabase() + { + Assert.Throws(() => + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Shuffle, false, out var errors); + }); + + //var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Shuffle, false, out var errors); + //Assert.That(errors.Count, Is.EqualTo(0)); + //Assert.That(oligos.Count, Is.EqualTo(10)); + } + + [Test] + public void TestSlideDecoy_Simple() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + Assert.Throws(() => + { + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + }); + + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public
void TestSlideDecoy_SimpleWithMods() + { + var oligos = new List() + { + new RNA("GUACUG"), + new RNA("GUACUA"), + }; + + Assert.Throws(() => + { + var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + }); + + //var decoys = RnaDecoyGenerator.GenerateDecoys(oligos, DecoyType.Slide); + //Assert.That(decoys.Count, Is.EqualTo(2)); + } + + [Test] + public void TestSlideDecoy_FromDatabase() + { + Assert.Throws(() => + { + var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Slide, false, out var errors); + }); + + //var oligos = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifiedFastaPath, true, DecoyType.Slide, false, out var errors); + //Assert.That(errors.Count, Is.EqualTo(0)); + //Assert.That(oligos.Count, Is.EqualTo(10)); + } + + + [Test] + public void TestCreateNew() + { + var mods = PtmListLoader.ReadModsFromString( + "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> modsOut).ToList(); + var modDict = mods.ToDictionary(p => p.IdWithMotif, p => p); + var oneBasedPossibleLocalizedModifications = new Dictionary>() + { + { 1, new List() { modDict["Sodium on A"] } }, + { 3, new List() { modDict["Sodium on A"] } }, + }; + + var rna = new RNA("GAACUG", "name", "accession", "organism", "databaseFilePath", + null, null, oneBasedPossibleLocalizedModifications, false, false, new List>(), + new Dictionary()); + var oligos = rna + .Digest(new RnaDigestionParams(maxMods: 1), new List(), mods) + .ToList(); + + var clonedRna = rna.CreateNew(null, null, true); + var clonedOligo = oligos.First().CreateNew(null, null, true); + + // ensure they are identical except for the isDecoy field + Assert.That(rna.BaseSequence, Is.EqualTo(clonedRna.BaseSequence)); + Assert.That(rna.OneBasedPossibleLocalizedModifications, Is.EqualTo(clonedRna.OneBasedPossibleLocalizedModifications)); + Assert.That(rna.IsDecoy, Is.Not.EqualTo(clonedRna.IsDecoy)); + + Assert.That(oligos.First().BaseSequence,
Is.EqualTo(clonedOligo.BaseSequence)); + Assert.That(oligos.First().OneBasedPossibleLocalizedModifications, Is.EqualTo(clonedOligo.OneBasedPossibleLocalizedModifications)); + Assert.That(oligos.First().Parent.IsDecoy, Is.Not.EqualTo(clonedOligo.Parent.IsDecoy)); + + + var newMods = new Dictionary>() + { + { 1, new List() { modDict["Sodium on A"] } }, + { 2, new List() { modDict["Sodium on A"] } }, + { 3, new List() { modDict["Sodium on A"] } }, + }; + clonedRna = rna.CreateNew("AAAAAA", newMods, null); + clonedOligo = oligos.First().CreateNew("AAAAAA", newMods, null); + + Assert.That(rna.BaseSequence, Is.Not.EqualTo(clonedRna.BaseSequence)); + Assert.That(rna.OneBasedPossibleLocalizedModifications, Is.Not.EqualTo(clonedRna.OneBasedPossibleLocalizedModifications)); + Assert.That(rna.IsDecoy, Is.EqualTo(clonedRna.IsDecoy)); + + Assert.That(oligos.First().BaseSequence, Is.Not.EqualTo(clonedOligo.BaseSequence)); + Assert.That(oligos.First().OneBasedPossibleLocalizedModifications, Is.Not.EqualTo(clonedOligo.OneBasedPossibleLocalizedModifications)); + Assert.That(oligos.First().Parent.IsDecoy, Is.EqualTo(clonedOligo.Parent.IsDecoy)); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestDigestion.cs b/mzLib/Test/Transcriptomics/TestDigestion.cs new file mode 100644 index 000000000..dc577a6d3 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestDigestion.cs @@ -0,0 +1,1188 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.IO; +using System.Linq; +using Chemistry; +using MassSpectrometry; +using MzLibUtil; +using NUnit.Framework; +using Omics; +using Omics.Digestion; +using Omics.Fragmentation; +using Omics.Modifications; +using Transcriptomics; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestDigestion + { + public record RnaDigestionTestCase(string BaseSequence, string Enzyme, int MissedCleavages, 
int MinLength, + int MaxLength, int DigestionProductCount, + double[] MonoMasses, string[] Sequences); + + public static IEnumerable GetTestCases() + { + // 6bp Top Down + yield return new RnaDigestionTestCase("GUACUG", "top-down", + 0, 1, 6, 1, + new[] { 1874.28 }, + new[] { "GUACUG" }); + // 6bp Rnase T1, normal + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 1, 6, 2, + new[] { 363.057, 1529.234 }, + new[] { "G", "UACUG" }); + // 6bp Cusativin, normal + yield return new RnaDigestionTestCase("GUACUG", "Cusativin", + 0, 1, 6, 2, + new[] { 1303.175, 589.116 }, + new[] { "GUAC", "UG" }); + // 6bp Rnase T1, one product too short + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 3, 6, 1, + new[] { 1529.234 }, + new[] { "UACUG" }); + // 6bp Rnase T1, one product too long + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 0, 1, 2, 1, + new[] { 363.057 }, + new[] { "G" }); + // 6bp Rnase T1, 1 missed cleavage + yield return new RnaDigestionTestCase("GUACUG", "RNase T1", + 1, 1, 6, 3, + new[] { 363.057, 1529.234, 1874.28 }, + new[] { "G", "UACUG", "GUACUG" }); + // 6bp Rnase A + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 0, 1, 6, 4, + new[] { 669.082, 652.103, 324.035, 283.091 }, + new[] { "GU", "AC", "U", "G" }); + // 6bp Rnase A, 1 missed cleavage + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 1, 1, 6, 7, + new[] { 669.082, 652.103, 324.035, 283.091, 1303.175, 958.128, 589.116 }, + new[] { "GU", "AC", "U", "G", "GUAC", "ACU", "UG" }); + // 6bp Rnase A, 2 missed cleavages + yield return new RnaDigestionTestCase("GUACUG", "RNase A", + 2, 1, 6, 9, + new[] { 669.082, 652.103, 324.035, 283.091, 1303.175, 958.128, 589.116, 1609.200, 1223.209 }, + new[] { "GU", "AC", "U", "G", "GUAC", "ACU", "UG", "GUACU", "ACUG" }); + // 20bp top-down + yield return new RnaDigestionTestCase("GUACUGCCUCUAGUGAAGCA", "top-down", + 0, 1, int.MaxValue, 1, + new[] { 6363.871 }, + new[] { "GUACUGCCUCUAGUGAAGCA" }); + 
// 20bp Rnase T1, normal + yield return new RnaDigestionTestCase("GUACUGCCUCUAGUGAAGCA", "RNase T1", + 0, 1, int.MaxValue, 6, + new[] { 363.057, 1609.200, 2219.282, 669.082, 1021.161, 572.137 }, + new[] { "G", "UACUG", "CCUCUAG", "UG", "AAG", "CA" }); + } + + public static string rnaseTsvpath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Digestion\rnases.tsv"); + + [OneTimeSetUp] + public void OneTimeSetup() + { + RnaseDictionary.Dictionary = RnaseDictionary.LoadRnaseDictionary(rnaseTsvpath); + } + + #region Rnase + + [Test] + public void TestRnaseDictionaryLoading() + { + var rnaseCountFromTsv = File.ReadAllLines(rnaseTsvpath).Length - 1; + Assert.That(RnaseDictionary.Dictionary.Count, Is.EqualTo(rnaseCountFromTsv)); + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestRnase_GetUnmodifiedOligos_Counts(RnaDigestionTestCase testCase) + { + RNA rna = new RNA(testCase.BaseSequence); + Rnase rnase = RnaseDictionary.Dictionary[testCase.Enzyme]; + var digestionProducts = + rnase.GetUnmodifiedOligos(rna, testCase.MissedCleavages, testCase.MinLength, testCase.MaxLength); + + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestRnase_GetUnmodifiedOligo_Sequence(RnaDigestionTestCase testCase) + { + RNA rna = new RNA(testCase.BaseSequence); + Rnase rnase = RnaseDictionary.Dictionary[testCase.Enzyme]; + var digestionProducts = + rnase.GetUnmodifiedOligos(rna, testCase.MissedCleavages, testCase.MinLength, testCase.MaxLength); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var product = digestionProducts[i]; + var testCaseCaseSequence = testCase.Sequences[i]; + Assert.That(product.BaseSequence == testCaseCaseSequence); + } + } + + [Test] + public void TestRnaseEqualityProperties() + { + Rnase t1 = RnaseDictionary.Dictionary["RNase T1"]; + Rnase 
t1Duplicate = RnaseDictionary.Dictionary["RNase T1"]; + Rnase t2 = RnaseDictionary.Dictionary["RNase T2"]; + + Assert.That(t1.Equals(t1Duplicate)); + Assert.That(t1.Equals(t1)); + Assert.That(!t1.Equals(t2)); + Assert.That(!t1.Equals(null)); + Assert.That(t1.GetHashCode(), Is.EqualTo(t1Duplicate.GetHashCode())); + Assert.That(t1.GetHashCode(), Is.Not.EqualTo(t2.GetHashCode())); + Assert.That(t1.Equals((object)t1Duplicate)); + Assert.That(t1.Equals((object)t1)); + Assert.That(!t1.Equals((object)t2)); + Assert.That(!t1.Equals((object)null)); + // ReSharper disable once SuspiciousTypeConversion.Global + Assert.That(!t1.Equals((object)new RNA("GUA"))); + } + + [Test] + public void TestRnase_UnmodifiedOligos_Exception() + { + Rnase rnase = new Rnase("Bad", CleavageSpecificity.SingleC, new List()); + Assert.Throws(() => { rnase.GetUnmodifiedOligos(new RNA("GUACUG"), 0, 1, 6); }); + } + + #endregion + + #region NucleolyticOligo + + [Test] + public void TestNucleolyticOligoProperties_FivePrimeDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + var oligo = digestionProducts[0]; + Assert.That(oligo.BaseSequence, Is.EqualTo("G")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(1)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(1)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('U')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('-')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_ThreePrimeDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + 
Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + NucleolyticOligo oligo = digestionProducts[2]; + Assert.That(oligo.BaseSequence, Is.EqualTo("CUG")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(4)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(6)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('-')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('A')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_InternalDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["RNase U2"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(3)); + + NucleolyticOligo oligo = digestionProducts[1]; + Assert.That(oligo.BaseSequence, Is.EqualTo("UA")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(2)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(3)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('C')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('G')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + [Test] + public void TestNucleolyticOligoProperties_TopDownDigestionProduct() + { + RNA rna = new("GUACUG"); + Rnase rnase = RnaseDictionary.Dictionary["top-down"]; + var digestionProducts = rnase.GetUnmodifiedOligos(rna, 0, 1, 6); + Assert.That(digestionProducts.Count, Is.EqualTo(1)); + + NucleolyticOligo oligo = digestionProducts[0]; + Assert.That(oligo.BaseSequence, Is.EqualTo("GUACUG")); + Assert.That(oligo.OneBasedStartResidue, Is.EqualTo(1)); + Assert.That(oligo.OneBasedEndResidue, Is.EqualTo(6)); + Assert.That(oligo.MissedCleavages, Is.EqualTo(0)); + 
Assert.That(oligo.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(oligo.NextResidue, Is.EqualTo('-')); + Assert.That(oligo.PreviousResidue, Is.EqualTo('-')); + Assert.That(oligo.ToString(), Is.EqualTo(oligo.BaseSequence)); + } + + #endregion + + #region OligoWithSetMods + + private static (string Sequence, int FragmentNumber, ProductType Type, double Mass)[] DigestFragmentTestCases => + new (string Sequence, int FragmentNumber, ProductType Type, double Mass)[] + { + ("UAG", 0, ProductType.M, 998.134), + ("UAG", 1, ProductType.aBaseLoss, 114.031), ("UAG", 2, ProductType.aBaseLoss, 420.056), + ("UAG", 1, ProductType.c, 308.031), ("UAG", 2, ProductType.c, 637.093), + ("UAG", 1, ProductType.dWaterLoss, 306.025), ("UAG", 2, ProductType.dWaterLoss, 635.077), + ("UAG", 1, ProductType.w, 443.023), ("UAG", 2, ProductType.w, 772.075), + ("UAG", 1, ProductType.y, 363.057), ("UAG", 2, ProductType.y, 692.109), + ("UAG", 1, ProductType.yWaterLoss, 345.047), ("UAG", 2, ProductType.yWaterLoss, 674.100), + + ("UCG", 0, ProductType.M, 974.123), + ("UCG", 1, ProductType.aBaseLoss, 114.031), ("UCG", 2, ProductType.aBaseLoss, 420.056), + ("UCG", 1, ProductType.c, 308.040), ("UCG", 2, ProductType.c, 613.082), + ("UCG", 1, ProductType.dWaterLoss, 306.025), ("UCG", 2, ProductType.dWaterLoss, 611.066), + ("UCG", 1, ProductType.w, 443.023), ("UCG", 2, ProductType.w, 748.064), + ("UCG", 1, ProductType.y, 363.057), ("UCG", 2, ProductType.y, 668.098), + ("UCG", 1, ProductType.yWaterLoss, 345.047), ("UCG", 2, ProductType.yWaterLoss, 650.089), + + ("UUG", 0, ProductType.M, 975.107), + ("UUG", 1, ProductType.aBaseLoss, 114.031), ("UUG", 2, ProductType.aBaseLoss, 420.056), + ("UUG", 1, ProductType.c, 308.041), ("UUG", 2, ProductType.c, 614.066), + ("UUG", 1, ProductType.dWaterLoss, 306.025), ("UUG", 2, ProductType.dWaterLoss, 612.050), + ("UUG", 1, ProductType.w, 443.023), ("UUG", 2, ProductType.w, 749.048), + ("UUG", 1, ProductType.y, 363.057), ("UUG", 2, 
ProductType.y, 669.082), + ("UUG", 1, ProductType.yWaterLoss, 345.047), ("UUG", 2, ProductType.yWaterLoss, 651.073), + + ("AUAG", 0, ProductType.M, 1247.220), + ("AUAG", 1, ProductType.aBaseLoss, 114.031), ("AUAG", 2, ProductType.aBaseLoss, 443.083), ("AUAG", 3, ProductType.aBaseLoss, 749.108), + ("AUAG", 1, ProductType.c, 331.068), ("AUAG", 2, ProductType.c, 637.093), ("AUAG", 3, ProductType.c, 966.146), + ("AUAG", 1, ProductType.dWaterLoss, 329.052), ("AUAG", 2, ProductType.dWaterLoss, 635.077), ("AUAG", 3, ProductType.dWaterLoss, 964.129), + ("AUAG", 1, ProductType.w, 363.057), ("AUAG", 2, ProductType.w, 692.109), ("AUAG", 3, ProductType.w, 998.134), + ("AUAG", 1, ProductType.y, 283.091), ("AUAG", 2, ProductType.y, 612.143), ("AUAG", 3, ProductType.y, 918.168), + ("AUAG", 1, ProductType.yWaterLoss, 265.081), ("AUAG", 2, ProductType.yWaterLoss, 594.134), ("AUAG", 3, ProductType.yWaterLoss, 900.159), + }; + + [Test] // test values calculated with http://rna.rega.kuleuven.be/masspec/mongo.htm + [TestCase("UAGUCGUUGAUAG", 4140.555, new[] { "UAG", "UCG", "UUG", "AUAG" }, + new[] { 998.134, 974.123, 975.107, 1247.220 })] + public static void TestDigestionAndFragmentation(string sequence, double monoMass, + string[] digestionProductSequences, double[] digestionProductMasses) + { + RNA rna = new(sequence); + Assert.That(rna.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + + // digest RNA + var digestionParams = new RnaDigestionParams("RNase T1"); + var products = rna.Digest(digestionParams, new List(), new List()) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(products.Count, Is.EqualTo(digestionProductSequences.Length)); + + // ensure digestion sequence and masses are correct + for (var index = 0; index < products.Count; index++) + { + var digestionProduct = products[index]; + Assert.That(digestionProduct.BaseSequence, Is.EqualTo(digestionProductSequences[index])); + Assert.That(digestionProduct.MonoisotopicMass, 
Is.EqualTo(digestionProductMasses[index]).Within(0.01)); + + List fragments = new(); + digestionProduct.Fragment(DissociationType.CID, FragmentationTerminus.Both, fragments); + + // test that fragments are correct + var fragmentsToCompare = DigestFragmentTestCases + .Where(p => p.Sequence.Equals(digestionProduct.BaseSequence)).ToList(); + for (var i = 0; i < fragments.Count; i++) + { + var fragment = fragments[i]; + var theoreticalFragment = fragmentsToCompare.FirstOrDefault(p => + p.FragmentNumber == fragment.FragmentNumber && p.Type == fragment.ProductType); + if (theoreticalFragment.Mass is 0.0 ) continue; + Assert.That(fragment.MonoisotopicMass, Is.EqualTo(theoreticalFragment.Mass).Within(0.01)); + Assert.That(fragment.FragmentNumber, Is.EqualTo(theoreticalFragment.FragmentNumber)); + Assert.That(fragment.ProductType, Is.EqualTo(theoreticalFragment.Type)); + Assert.That(fragment.FragmentNumber, Is.EqualTo(theoreticalFragment.FragmentNumber)); + if (fragment.Terminus == FragmentationTerminus.FivePrime) + Assert.That(fragment.AminoAcidPosition, Is.EqualTo(theoreticalFragment.FragmentNumber)); + else if (fragment.Terminus == FragmentationTerminus.None) + Assert.That(fragment.FragmentNumber, Is.EqualTo(0)); + else + Assert.That(fragment.AminoAcidPosition, Is.EqualTo(digestionProductSequences[index].Length - theoreticalFragment.FragmentNumber)); + } + } + } + + [Test] + [TestCase("UAGUCGUUGAUAG", new[] { "UAG", "UCG", "UUG", "AUAG" }, + new[] { 1, 4, 7, 10 }, new[] { 3, 6, 9, 13 }, new[] { '-', 'G', 'G', 'G' }, + new[] { 'U', 'U', 'A', '-' })] + public static void TestOligoWithSetMods_AAPositions(string sequence, string[] digestionProductSequences, + int[] startResidue, int[] endResidue, char[] preciousResidue, char[] nextResidue) + { + RNA rna = new RNA(sequence); + var digestionProducts = rna.Digest(new RnaDigestionParams("RNase T1"), new List(), + new List()).Select(p => (OligoWithSetMods)p).ToList(); + + Assert.That(digestionProducts.All(p => 
p.DigestionParams.DigestionAgent.Name == "RNase T1")); + for (var index = 0; index < digestionProducts.Count; index++) + { + var digestionProduct = digestionProducts[index]; + Assert.That(digestionProduct.BaseSequence, Is.EqualTo(digestionProductSequences[index])); + Assert.That(digestionProduct.OneBasedStartResidue, Is.EqualTo(startResidue[index])); + Assert.That(digestionProduct.OneBasedEndResidue, Is.EqualTo(endResidue[index])); + Assert.That(digestionProduct.PreviousResidue, Is.EqualTo(preciousResidue[index])); + Assert.That(digestionProduct.NextResidue, Is.EqualTo(nextResidue[index])); + } + } + + [Test] + public static void TestTermini_ThreePrimeCyclicPhosphate() + { + string sequence = "UAGUCGUUGAUAG"; + RNA rna = new RNA(sequence); + var oligoCyclicPhosphate = PtmListLoader.ReadModsFromString( + "ID Cyclic Phosphate\r\nTG X\r\nPP Oligo 3'-terminal.\r\nMT Digestion Termini\r\nCF H-2 O-1\r\nDR Unimod; 280.\r\n//", + out List<(Modification, string)> errors).First(); + var nucleicAcidCyclicPhosphate = PtmListLoader.ReadModsFromString( + "ID Cyclic Phosphate\r\nTG X\r\nPP 3'-terminal.\r\nMT Digestion Termini\r\nCF H-2 O-1\r\nDR Unimod; 280.\r\n//", + out errors).First(); + Assert.That(!errors.Any()); + + // top-down digestion, 3' terminal modification + var variableMods = new List { nucleicAcidCyclicPhosphate }; + var digestionParams = new RnaDigestionParams("top-down"); + var digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[0].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[0].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG")); + + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + 
Assert.That(digestionProducts[1].SequenceWithChemicalFormulas, Is.EqualTo("UAGUCGUUGAUAG[H-2O-1]")); + Assert.That(digestionProducts[1].FullSequenceWithMassShift(), Is.EqualTo("UAGUCGUUGAUAG[-18.010565]")); + + // top-down digestion, 3' oligo terminal modification + variableMods = new List { oligoCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("UAGUCGUUGAUAG[Digestion Termini:Cyclic Phosphate on X]")); + + // RNase T1 digestion, 3' terminal modification + digestionParams = new RnaDigestionParams("RNase T1"); + variableMods = new List { nucleicAcidCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(5)); + var expected = new List() + { + "UAG", "UCG", "UUG", "AUAG", "AUAG[Digestion Termini:Cyclic Phosphate on X]" + }; + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + + // RNase T1 digestion, 3' oligo terminal modification + variableMods = new List { oligoCyclicPhosphate }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(8)); + expected = new List() + { + "UAG", "UAG[Digestion Termini:Cyclic Phosphate on X]", + "UCG", "UCG[Digestion Termini:Cyclic Phosphate on X]", + "UUG", "UUG[Digestion Termini:Cyclic Phosphate on X]", + "AUAG","AUAG[Digestion Termini:Cyclic Phosphate on X]" + }; + + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + } + + [Test] + public static void 
TestTermini_FivePrimeLargeMod() + { + string sequence = "UAGUCGUUGAUAG"; + RNA rna = new RNA(sequence); + var oligoLargeMod = PtmListLoader.ReadModsFromString( + "ID Pfizer 5'-Cap\r\nTG X\r\nPP Oligo 5'-terminal.\r\nMT Standard\r\nCF C13H22N5O14P3\r\nDR Unimod; 280.\r\n//", + out List<(Modification, string)> errors).First(); + var nucleicAcidLargeMod = PtmListLoader.ReadModsFromString( + "ID Pfizer 5'-Cap\r\nTG X\r\nPP 5'-terminal.\r\nMT Standard\r\nCF C13H22N5O14P3\r\nDR Unimod; 280.\r\n//", + out errors).First(); + Assert.That(!errors.Any()); + + // top-down digestion, 5' terminal modification, expect two products + var variableMods = new List { nucleicAcidLargeMod }; + var digestionParams = new RnaDigestionParams("top-down"); + var digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("[Standard:Pfizer 5'-Cap on X]UAGUCGUUGAUAG")); + + // top-down digestion, 5' oligo terminal modification, expect two products + variableMods = new List { oligoLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(2)); + Assert.That(digestionProducts[0].FullSequence, Is.EqualTo("UAGUCGUUGAUAG")); + Assert.That(digestionProducts[1].FullSequence, Is.EqualTo("[Standard:Pfizer 5'-Cap on X]UAGUCGUUGAUAG")); + + // RNase T1 digestion, 5' terminal modification + digestionParams = new RnaDigestionParams("RNase T1"); + variableMods = new List { nucleicAcidLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(5)); + var expected = new List() + { + "UAG", "[Standard:Pfizer 5'-Cap on 
X]UAG", "UCG", "UUG", "AUAG" + }; + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + + // RNase T1 digestion, 5' oligo terminal modification + variableMods = new List { oligoLargeMod }; + digestionProducts = rna.Digest(digestionParams, new List(), variableMods) + .Select(p => (OligoWithSetMods)p).ToList(); + Assert.That(digestionProducts.Count, Is.EqualTo(8)); + expected = new List() + { + "UAG", "[Standard:Pfizer 5'-Cap on X]UAG", + "UCG", "[Standard:Pfizer 5'-Cap on X]UCG", + "UUG", "[Standard:Pfizer 5'-Cap on X]UUG", + "AUAG", "[Standard:Pfizer 5'-Cap on X]AUAG" + }; + + for (int i = 0; i < expected.Count; i++) + { + Assert.That(digestionProducts[i].FullSequence, Is.EqualTo(expected[i])); + } + } + + [Test] + [TestCase("UAGUCGUUGAUAG")] + public static void TestOligoWithSetMods_PropertiesWithTopDownDigestion(string sequence) + { + var rna = new RNA(sequence); + var oligoWithSetMods = + rna.Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + + Assert.That(rna.BaseSequence, Is.EqualTo(oligoWithSetMods.BaseSequence)); + Assert.That(rna.ThreePrimeTerminus, Is.EqualTo(oligoWithSetMods.ThreePrimeTerminus)); + Assert.That(rna.FivePrimeTerminus, Is.EqualTo(oligoWithSetMods.FivePrimeTerminus)); + Assert.That(rna.ThisChemicalFormula, Is.EqualTo(oligoWithSetMods.ThisChemicalFormula)); + Assert.That(rna.Length, Is.EqualTo(oligoWithSetMods.Length)); + } + + [Test] + public static void OligoWithSetMods_CalculatedValues() + { + var rna = new RNA("GUACUG"); + var rnaFormula = rna.ThisChemicalFormula; + + string modText = "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//"; + var sodiumAdduct = PtmListLoader.ReadModsFromString(modText, out List<(Modification, string)> mods).First(); + var oligoWithSetMods = + rna.Digest(new RnaDigestionParams(), new List() { sodiumAdduct }, new List()) + .First() as OligoWithSetMods ?? throw new NullReferenceException(); + + Assert.That(oligoWithSetMods.NumMods, Is.EqualTo(1)); + Assert.That(oligoWithSetMods.NumFixedMods, Is.EqualTo(1)); + Assert.That(oligoWithSetMods.NumVariableMods, Is.EqualTo(0)); + Assert.That(oligoWithSetMods.CleavageSpecificityForFdrCategory, Is.EqualTo(CleavageSpecificity.Full)); + + var formula = oligoWithSetMods.ThisChemicalFormula; + Assert.That(formula, Is.EqualTo(rnaFormula + sodiumAdduct.ChemicalFormula)); + + var formulaToAdd = ChemicalFormula.ParseFormula("H"); + var deltaMass = formulaToAdd.MonoisotopicMass; + var oldMonoMass = oligoWithSetMods.MonoisotopicMass; + var oldMostAbundantMass = oligoWithSetMods.MostAbundantMonoisotopicMass; + + oligoWithSetMods.FivePrimeTerminus = formulaToAdd + oligoWithSetMods.FivePrimeTerminus; + + Assert.That(oligoWithSetMods.MonoisotopicMass, Is.EqualTo(oldMonoMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.MostAbundantMonoisotopicMass, Is.EqualTo(oldMostAbundantMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.ThisChemicalFormula, 
Is.EqualTo(formula + formulaToAdd)); + + oldMonoMass = oligoWithSetMods.MonoisotopicMass; + oldMostAbundantMass = oligoWithSetMods.MostAbundantMonoisotopicMass; + oligoWithSetMods.ThreePrimeTerminus = formulaToAdd + oligoWithSetMods.ThreePrimeTerminus; + + Assert.That(oligoWithSetMods.MonoisotopicMass, Is.EqualTo(oldMonoMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.MostAbundantMonoisotopicMass, Is.EqualTo(oldMostAbundantMass + deltaMass).Within(0.01)); + Assert.That(oligoWithSetMods.ThisChemicalFormula, Is.EqualTo(formula + formulaToAdd + formulaToAdd)); + + Assert.Throws(() => + { + var oligo = new OligoWithSetMods("GUA|GAUGUC", new Dictionary()); + }); + } + + #endregion + + #region DigestionParams + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestDigestionParams_Properties(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(RnaseDictionary.Dictionary[testCase.Enzyme])); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(testCase.MissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(testCase.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(testCase.MaxLength)); + + digestionParams.MaxModificationIsoforms = 2048; + digestionParams.MaxMods = 3; + Assert.That(digestionParams.MaxModificationIsoforms, Is.EqualTo(2048)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(3)); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()); + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + public void TestDigestionParamsClone() + { + var digestionParams = new RnaDigestionParams("top-down", 0, 3, 20000); + var cloned = digestionParams.Clone(FragmentationTerminus.C); + + // set new terminus, all values except 
terminus are retained + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(cloned.DigestionAgent)); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(cloned.MaxMissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(cloned.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); + Assert.That(digestionParams.FragmentationTerminus, Is.Not.EqualTo(cloned.FragmentationTerminus)); + Assert.That(digestionParams.SearchModeType, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.C)); + + // do not set new terminus, all values are retained + cloned = digestionParams.Clone(); + Assert.That(digestionParams.DigestionAgent, Is.EqualTo(cloned.DigestionAgent)); + Assert.That(digestionParams.MaxMissedCleavages, Is.EqualTo(cloned.MaxMissedCleavages)); + Assert.That(digestionParams.MinLength, Is.EqualTo(cloned.MinLength)); + Assert.That(digestionParams.MaxLength, Is.EqualTo(cloned.MaxLength)); + Assert.That(digestionParams.MaxMods, Is.EqualTo(cloned.MaxMods)); + Assert.That(digestionParams.FragmentationTerminus, Is.EqualTo(cloned.FragmentationTerminus)); + Assert.That(digestionParams.SearchModeType, Is.EqualTo(CleavageSpecificity.Full)); + Assert.That(cloned.FragmentationTerminus, Is.EqualTo(FragmentationTerminus.Both)); + } + + #endregion + + #region NucleicAcid + + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_Counts(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()); + Assert.That(digestionProducts.Count(), Is.EqualTo(testCase.DigestionProductCount)); + } + + [Test] + 
[TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_Sequences(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()) + .ToList(); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var product = digestionProducts[i]; + var testCaseCaseSequence = testCase.Sequences[i]; + Assert.That(product.BaseSequence, Is.EqualTo(testCaseCaseSequence)); + Assert.That(product.FullSequence, Is.EqualTo(testCaseCaseSequence)); + } + } + + [Test] + [TestCaseSource(nameof(GetTestCases))] + public void TestNucleicAcid_Digestion_WithoutMods_MonoMasses(RnaDigestionTestCase testCase) + { + var rna = new RNA(testCase.BaseSequence); + var digestionParams = new RnaDigestionParams(testCase.Enzyme, testCase.MissedCleavages, testCase.MinLength, + testCase.MaxLength); + + var digestionProducts = rna.Digest(digestionParams, new List(), new List()) + .ToList(); + + Assert.That(digestionProducts.Count, Is.EqualTo(testCase.Sequences.Length)); + for (var i = 0; i < digestionProducts.Count; i++) + { + var productMass = digestionProducts[i].MonoisotopicMass; + var testCaseCaseMass = testCase.MonoMasses[i]; + Assert.That(productMass, Is.EqualTo(testCaseCaseMass).Within(0.01)); + } + } + + [Test] + public static void TestNucleicAcid_Digestion_Exception() + { + IDigestionParams digestionParams = new Proteomics.ProteolyticDigestion.DigestionParams(); + var rna = new RNA("GUACUGGUACUG"); + + try + { + var result = rna.Digest(digestionParams, new List(), new List()); + } + catch (Exception e) + { + Assert.That(e, Is.TypeOf()); + Assert.That(e.InnerException, Is.TypeOf()); + } + } + + #endregion + + #region Digestion with Modifications + + public static 
List SodiumAdducts => + PtmListLoader.ReadModsFromString("ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List PotassiumAdducts => + PtmListLoader.ReadModsFromString("ID Potassium\r\nMT Metal\r\nPP Anywhere.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List TerminalSodiumAdducts => + PtmListLoader.ReadModsFromString("ID Sodium\r\nMT Metal\r\nPP 3'-terminal.\r\nTG A or C or G or U\r\nCF Na1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + public static List TerminalPotassiumAdducts => + PtmListLoader.ReadModsFromString("ID Potassium\r\nMT Metal\r\nPP 5'-terminal.\r\nTG A or C or G or U\r\nCF K1H-1\r\n" + @"//", + out List<(Modification, string)> mods).ToList(); + + [Test] + public static void TestVariableModsCountCorrect() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams() + { + MaxMods = 1, + }; + + var precursors = rna.Digest(rnaDigestionParams, new List(), SodiumAdducts) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(7)); + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + + rnaDigestionParams.MaxMods = 2; + precursors = rna.Digest(rnaDigestionParams, new List(), SodiumAdducts) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(22)); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + 
Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]U[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("G[Metal:Sodium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]A[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]AC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GU[Metal:Sodium on U]ACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]C[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUA[Metal:Sodium on A]CUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]U[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G")); + Assert.That(fullSequences.Contains("GUACU[Metal:Sodium on U]G[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + } + + [Test] + public static void TestFixedModsCountCorrect() + { + var sodiumAdduct = new List() { SodiumAdducts[0] }; + + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams() + { + MaxMods = 1, + }; + var 
precursors = rna.Digest(rnaDigestionParams, sodiumAdduct, new List()) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.First().NumFixedMods, Is.EqualTo(1)); + Assert.That(precursors.First().FullSequence, Is.EqualTo("GUA[Metal:Sodium on A]CUG")); + Assert.That(precursors.First().MonoisotopicMass, Is.EqualTo(1896.26).Within(0.01)); + + sodiumAdduct = new List() { SodiumAdducts[2] }; + + precursors = rna.Digest(rnaDigestionParams, sodiumAdduct, new List()) + .ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.First().NumFixedMods, Is.EqualTo(2)); + Assert.That(precursors.First().FullSequence, Is.EqualTo("G[Metal:Sodium on G]UACUG[Metal:Sodium on G]")); + Assert.That(precursors.First().MonoisotopicMass, Is.EqualTo(1918.25).Within(0.01)); + } + + [Test] + public static void TestFixedAndVariableMods() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + rnaDigestionParams.MaxMods = 1; + var fixedMods = new List { PotassiumAdducts[0] }; // A + var variableMods = new List { SodiumAdducts[1] }; // C + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.All(p => p.NumFixedMods == 1)); + Assert.That(fullSequences.Contains("GUA[Metal:Potassium on A]CUG")); + Assert.That(fullSequences.Contains("GUA[Metal:Potassium on A]C[Metal:Sodium on C]UG")); + + var oneOfEach = precursors.First(p => p.FullSequence.Equals("GUA[Metal:Potassium on A]C[Metal:Sodium on C]UG")); + Assert.That(oneOfEach.NumFixedMods, Is.EqualTo(1)); + Assert.That(oneOfEach.NumVariableMods, Is.EqualTo(1)); + Assert.That(oneOfEach.NumMods, Is.EqualTo(2)); + + fixedMods = new List { PotassiumAdducts[2] }; // G + variableMods = new List { SodiumAdducts[1] }; // C + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + 
.ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + + fixedMods = new List { PotassiumAdducts[2] }; // G + variableMods = new List { SodiumAdducts[1], SodiumAdducts[3] }; // C, U + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(4)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]ACUG[Metal:Potassium on G]")); + + rnaDigestionParams.MaxMods = 2; + precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(7)); + Assert.That(precursors.All(p => p.NumFixedMods == 2)); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]ACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium 
on U]ACU[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UAC[Metal:Sodium on C]U[Metal:Sodium on U]G[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]U[Metal:Sodium on U]AC[Metal:Sodium on C]UG[Metal:Potassium on G]")); + } + + /// + /// Test when one fixed and one variable mod are used and share a localization + /// expect two results, one with the fixed, and one with the variable + /// + [Test] + public static void TestFixedAndVariableMods_LocalizationOverlap() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { PotassiumAdducts[1] }; // C + var variableMods = new List { SodiumAdducts[1] }; // C + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors.Any(p => p.NumFixedMods == 1)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 0)); + Assert.That(precursors.All(p => p.NumMods == 1)); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when two variable mods are used and share a localization + /// expect three results, one unmodified, and two singly modified + /// + [Test] + public static void TestVariableMods_LocalizationOverlap() + { + var rna = new RNA("GUACUG"); + var rnaDigestionParams = new RnaDigestionParams(); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // C + var variableMods = new List { PotassiumAdducts[1], SodiumAdducts[1] }; + var precursors = 
rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + // expect three results, one unmodified, and two singly modified + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(3)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when one modification is annotated in the database, out of bounds + /// expect one result, the unmodified oligo, because annotated position 23 lies outside the six-residue sequence + /// + [Test] + public static void TestDatabaseAnnotatedMods_OutOfBounds() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 23, new List() { PotassiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // no fixed mods + var variableMods = new List { }; // no variable mods + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(1)); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + Assert.That(precursors.All(p => p.NumVariableMods == 0)); + Assert.That(precursors.All(p => p.NumMods == 0)); + Assert.That(fullSequences.Contains("GUACUG")); + } + } + + /// + /// Test when one modification is annotated in the database + /// expect two results, one unmodified, and one singly modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_SingleModification() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 4, new List() { 
PotassiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; // C + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2)); + Assert.That(precursors[0].NumMods, Is.EqualTo(0)); + Assert.That(precursors[1].NumMods, Is.EqualTo(1)); + Assert.That(precursors[1].NumVariableMods, Is.EqualTo(1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + } + } + + /// + /// Test when two modifications are annotated in the database at the same location + /// expect three results, one unmodified, and two singly modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_LocalizationOverlap() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 4, new List() { PotassiumAdducts[1], SodiumAdducts[1] } } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(3)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Potassium on C]UG")); + Assert.That(fullSequences.Contains("GUAC[Metal:Sodium on C]UG")); + } + } + + /// + /// Test when two terminal modifications 
are annotated in the database + /// MaxMods 1: expect three results, one unmodified, and two singly modified + /// MaxMods 2: expect four results, one unmodified, and two singly modified, and one double modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2]} }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database + for (int i = 1; i < 3; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + // expect three results, one unmodified, and two singly modified + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.Count, Is.EqualTo(2 + i)); + Assert.That(precursors.Any(p => p.NumFixedMods == 0)); + Assert.That(precursors.Any(p => p.NumVariableMods == 1)); + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + + if (rnaDigestionParams.MaxMods != 2) continue; + Assert.That(precursors.Any(p => p.NumVariableMods == 2)); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + } + } + + /// + /// Test when two terminal modifications are annotated in the database and one database mod on first residue + /// MaxMods 1: expect four results, one unmodified, and three singly modified + /// MaxMods 2: expect seven results, one unmodified, and three singly modified, and three double modified + /// MaxMods 3: expect eight results, one unmodified, and three singly modified, and three 
double modified, and one triply modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueDatabaseMod() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2], PotassiumAdducts[2] } }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database and one database mod on first residue + for (int i = 1; i < 4; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { }; + + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + + switch (rnaDigestionParams.MaxMods) + { + case 1: + Assert.That(precursors.Count(), Is.EqualTo(4)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods == 1)); + break; + case 2: + Assert.That(precursors.Count(), Is.EqualTo(7)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + + case 3: + Assert.That(precursors.Count(), Is.EqualTo(8)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + } + + if (rnaDigestionParams.MaxMods >= 1) // unmodified and singly modified forms are expected at every MaxMods + { + + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + } + if (rnaDigestionParams.MaxMods >= 2) // independent check, not else-if: the >= 1 branch always runs, so a chained branch would never execute + { + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on 
G]G[Metal:Potassium on G]UACUG")); + } + if (rnaDigestionParams.MaxMods >= 3) // independent check so the triply modified form is asserted as well + { + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + } + } + } + + /// + /// Test when two terminal modifications are annotated in the database and potassium on G is supplied as a variable mod + /// MaxMods 1: expect five results, one unmodified, and four singly modified + /// MaxMods 2: expect eleven results, one unmodified, and four singly modified, and six double modified + /// MaxMods 3: expect fifteen results, one unmodified, and four singly modified, and six double modified, and four triply modified + /// + [Test] + public static void TestDatabaseAnnotatedMods_TerminalMods_WithFirstResidueVariableMod() + { + var rnaDigestionParams = new RnaDigestionParams(); + var oneBasedModifications = new Dictionary>() + { + { 1, new List() { TerminalPotassiumAdducts[2] } }, + { 6, new List() { TerminalSodiumAdducts[2]} } + }; + var rna = new RNA("GUACUG", oneBasedPossibleLocalizedModifications: oneBasedModifications); + + // Test when two terminal modifications are annotated in the database and potassium on G is supplied as a variable mod + for (int i = 1; i < 4; i++) + { + rnaDigestionParams.MaxMods = i; + var fixedMods = new List { }; + var variableMods = new List { PotassiumAdducts[2] }; + + var precursors = rna.Digest(rnaDigestionParams, fixedMods, variableMods) + .ToList(); + + var fullSequences = precursors.Select(p => p.FullSequence).ToList(); + Assert.That(precursors.All(p => p.NumFixedMods == 0)); + + switch (rnaDigestionParams.MaxMods) + { + case 1: + Assert.That(precursors.Count(), Is.EqualTo(5)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods == 1)); + break; + case 2: + Assert.That(precursors.Count(), Is.EqualTo(11)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); + break; + + case 3: + Assert.That(precursors.Count(), Is.EqualTo(15)); + Assert.That(precursors.Skip(1).All(p => p.NumVariableMods >= 1)); 
+ break; + } + + if (rnaDigestionParams.MaxMods >= 1) // unmodified and singly modified forms are expected at every MaxMods + { + + Assert.That(fullSequences.Contains("GUACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG")); + } + if (rnaDigestionParams.MaxMods >= 2) // independent check, not else-if: the >= 1 branch always runs, so a chained branch would never execute + { + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G]")); + } + if (rnaDigestionParams.MaxMods >= 3) // independent check so the triply modified forms are asserted as well + { + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]G[Metal:Potassium on G]UACUG[Metal:Potassium on G]")); + + Assert.That(fullSequences.Contains("G[Metal:Potassium on G]UACUG[Metal:Potassium on G][Metal:Sodium on G]")); + Assert.That(fullSequences.Contains("[Metal:Potassium on G]GUACUG[Metal:Potassium on G][Metal:Sodium on G]")); + } + } + } + + [Test] + public static void TestDigestionMaxIsoforms() + { + var rna = new RNA("GUACUAGACUACAUGGUACAUCA"); + var rnaDigestionParams = new RnaDigestionParams(); + var variableMods = SodiumAdducts.Concat(PotassiumAdducts) + .Concat(TerminalPotassiumAdducts).Concat(TerminalSodiumAdducts).ToList(); + + var digestionProducts = rna.Digest(rnaDigestionParams, new List(), variableMods) + .ToList(); + Assert.That(digestionProducts.Count, 
Is.EqualTo(rnaDigestionParams.MaxModificationIsoforms)); + } + + #endregion + } +} diff --git a/mzLib/Test/Transcriptomics/TestFragmentation.cs b/mzLib/Test/Transcriptomics/TestFragmentation.cs new file mode 100644 index 000000000..76ddb8c3b --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestFragmentation.cs @@ -0,0 +1,244 @@ +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Transcriptomics; +using MassSpectrometry; +using Omics; +using Omics.Fragmentation; +using Omics.Fragmentation.Oligo; +using Omics.Modifications; +using Transcriptomics.Digestion; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestFragmentation + { + + public static IEnumerable GetSixMerIndividualFragmentTypeTestCases() => + TestNucleicAcid.GetSixmerIndividualFragmentTypeTestCases(); + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestGetNeutralFragments(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + + var neutralFragments = rna.GetNeutralFragments(testCase.Type).ToList(); + for (int i = 0; i < neutralFragments.Count; i++) // start at index 0 so the first fragment mass is verified too + { + Assert.That(neutralFragments[i].NeutralMass, Is.EqualTo(testCase.NeutralMasses[i]).Within(0.01)); + } + } + + + private static IEnumerable ImplementedDissociationTypes + { + get + { + Loaders.LoadElements(); + foreach (var type in DissociationTypeCollection.AllImplementedDissociationTypes) + yield return type; + } + } + + /// + /// This test makes the assumption that the M ion is a component of all product types + /// + /// + [Test] + [TestCaseSource(nameof(ImplementedDissociationTypes))] + public void TestFragmentation_Unmodified_ProductCountsAreCorrect(DissociationType type) + { + Loaders.LoadElements(); + List products = new(); + var rnaToTest = new List + { + new RNA("GUACUG"), + new RNA("GUACUGCACUGU"), + new RNA("GUACUGUAAUGAGACUAGUACAUGACAUG"), + }; + var terminiToTest = new List { FragmentationTerminus.Both, FragmentationTerminus.FivePrime, FragmentationTerminus.ThreePrime }; + var potentialProducts = type.GetRnaProductTypesFromDissociationType(); + + // test with top down digestion and no modifications + var digestionparams = new RnaDigestionParams(rnase: "top-down"); + var fixedMods = new List(); + var variableMods = new List(); + foreach (var term in terminiToTest) + { + foreach (var oligoWithSetMods in rnaToTest.Select(rna => rna.Digest(digestionparams, fixedMods, variableMods).First())) + { + var terminalSpecifc = term == FragmentationTerminus.Both + ? potentialProducts + : potentialProducts.Where(p => p.GetRnaTerminusType() == term).ToList(); + + var expectedProductCount = term == FragmentationTerminus.Both + ? 
(oligoWithSetMods.Length - 1) * (terminalSpecifc.Count - 1) + 1 // there is only one M ion, so for both, remove that form muliplier and add one + : (oligoWithSetMods.Length - 1) * terminalSpecifc.Count; + + oligoWithSetMods.Fragment(type, term, products); + Assert.That(products.Count, Is.EqualTo(expectedProductCount)); + Assert.That(products.All(p => terminalSpecifc.Contains(p.ProductType))); + } + } + } + + [Test] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.a, + new[] { 267.089, 573.114, 902.167, 1207.208, 1513.233 }, + new[] { 267.089, 573.114, 902.167 + 21.982, 1207.208 + 21.982, 1513.233 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.b, + new[] { 283.084, 589.109, 918.162, 1223.203, 1529.228 }, + new[] { 283.084, 589.109, 918.162 + 21.982, 1223.203 + 21.982, 1529.228 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.c, + new[] { 347.055, 653.081, 982.133, 1287.174, 1593.2 }, + new[] { 347.055, 653.081, 982.133 + 21.982, 1287.174 + 21.982, 1593.2 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.d, + new[] { 363.05, 669.075, 998.128, 1303.169, 1609.195 }, + new[] { 363.05, 669.075, 998.128 + 21.982, 1303.169 + 21.982, 1609.195 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.dWaterLoss, + new[] { 345.039, 651.064, 980.116, 1285.157, 1591.184 }, + new[] { 345.039, 651.064, 980.116 + 21.982, 1285.157 + 21.982, 1591.184 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT 
Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.w, + new[] { 363.049, 669.074, 974.115, 1303.169, 1609.195 }, + new[] { 363.049, 669.074, 974.115, 1303.169 + 21.982, 1609.195 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.x, + new[] { 347.055, 653.081, 958.122, 1287.174, 1593.2 }, + new[] { 347.055, 653.081, 958.122, 1287.174 + 21.982, 1593.2 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.y, + new[] { 283.084, 589.109, 894.15, 1223.203, 1529.228 }, + new[] { 283.084, 589.109, 894.15, 1223.203 + 21.982, 1529.228 + 21.982 })] + [TestCase("GUACUG", "ID Sodium\r\nMT Metal\r\nPP Anywhere.\r\nTG A\r\nCF Na1H-1\r\n" + @"//", + "GUA[Metal:Sodium on A]CUG", 1874.28, 1896.26, ProductType.z, + new[] { 267.089, 573.124, 878.156, 1207.208, 1513.233 }, + new[] { 267.089, 573.124, 878.156, 1207.208 + 21.982, 1513.233 + 21.982 })] + public void TestFragmentation_Modified(string sequence, string modString, string fullSequence, double unmodifiedMass, double modifiedMass, + ProductType productType, double[] unmodifiedFragmentMass, double[] modifiedFragmentMasses) + { + var mods = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).ToList(); + var modDict = mods.ToDictionary(p => p.IdWithMotif, p => p); + var rna = new RNA(sequence); + + var unmodifiedOligo = new OligoWithSetMods(sequence, new Dictionary(), + 0, new RnaDigestionParams(), rna, 1, rna.Length); + Assert.That(unmodifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(0)); + Assert.That(unmodifiedOligo.FullSequence, Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.SequenceWithChemicalFormulas, Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.FullSequenceWithMassShift(), 
Is.EqualTo(sequence)); + Assert.That(unmodifiedOligo.MonoisotopicMass, Is.EqualTo(unmodifiedMass).Within(0.01)); + + var modifiedOligo = new OligoWithSetMods(fullSequence, modDict, + 0, new RnaDigestionParams(), rna, 1, rna.Length); + var formulaSequence = fullSequence.Replace("Metal:Sodium on A", "H-1Na"); + var massShiftSequence = fullSequence.Replace("Metal:Sodium on A", "+21.981944"); + Assert.That(modifiedOligo.AllModsOneIsNterminus.Count, Is.EqualTo(mods.Count)); + Assert.That(modifiedOligo.FullSequence, Is.EqualTo(fullSequence)); + Assert.That(modifiedOligo.SequenceWithChemicalFormulas, Is.EqualTo(formulaSequence)); + Assert.That(modifiedOligo.FullSequenceWithMassShift(), Is.EqualTo(massShiftSequence)); + Assert.That(modifiedOligo.MonoisotopicMass, Is.EqualTo(modifiedMass).Within(0.01)); + + var unmodifiedProducts = unmodifiedOligo.GetNeutralFragments(productType).ToList(); + Assert.That(unmodifiedProducts.Count, Is.EqualTo(5)); + var modifiedProducts = modifiedOligo.GetNeutralFragments(productType).ToList(); + Assert.That(modifiedProducts.Count, Is.EqualTo(5)); + + + for (int i = 0; i < unmodifiedProducts.Count; i++) + { + var unModifedProduct = unmodifiedProducts[i]; + var modifiedProduct = modifiedProducts[i]; + + Assert.That(unModifedProduct.NeutralMass, Is.EqualTo(unmodifiedFragmentMass[i]).Within(0.01)); + Assert.That(modifiedProduct.NeutralMass, Is.EqualTo(modifiedFragmentMasses[i]).Within(0.01)); + } + } + + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestRnaFragments(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new NullReferenceException(); + List products = rna.GetNeutralFragments(testCase.Type).Select(p => (Product)p).ToList(); + + for (int i = 0; i < products.Count; i++) + { + var product = products[i]; + Assert.That(testCase.Type, Is.EqualTo(product.ProductType)); + Assert.That(testCase.Type.GetRnaTerminusType(), Is.EqualTo(product.Terminus)); + Assert.That(testCase.NeutralMasses[i], Is.EqualTo(product.NeutralMass).Within(0.01)); + Assert.That(testCase.NeutralMasses[i], Is.EqualTo(product.MonoisotopicMass).Within(0.01)); + Assert.That(0, Is.EqualTo(product.NeutralLoss)); + Assert.That(null, Is.EqualTo(product.SecondaryProductType)); + Assert.That(0, Is.EqualTo(product.SecondaryFragmentNumber)); + + string annotation = $"{product.ProductType}{product.FragmentNumber}"; + Assert.That(annotation, Is.EqualTo(product.Annotation)); + string toString = + $"{product.ProductType}{product.FragmentNumber};{product.NeutralMass:F5}-{product.NeutralLoss:0.##}"; + Assert.That(toString, Is.EqualTo(product.ToString())); + } + } + + [Test] + [TestCaseSource(nameof(GetSixMerIndividualFragmentTypeTestCases))] + public void TestRnaFragmentNumbers(TestNucleicAcid.SixmerTestCase testCase) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? throw new NullReferenceException(); + List products = rna.GetNeutralFragments(testCase.Type).Select(p => (Product)p).ToList(); + + for (int i = 0; i < products.Count; i++) + { + var product = products[i]; + bool isThreePrime = product.ProductType.GetRnaTerminusType() == FragmentationTerminus.ThreePrime; + + int fragmentNumber = i + 1; + int residuePosition = isThreePrime ? 
rna.Length - fragmentNumber : fragmentNumber; + + Assert.That(product.FragmentNumber, Is.EqualTo(fragmentNumber)); + Assert.That(product.ResiduePosition, Is.EqualTo(residuePosition)); + } + + } + + [Test] + public void TestConstructorAndEquality() + { + Product product1 = new Product(ProductType.d, FragmentationTerminus.FivePrime, 200, 4, 4, 0.0); + Product product2 = new Product(ProductType.d, FragmentationTerminus.FivePrime, 200, 4, 4, 0.0); + Product uniqueProduct = new Product(ProductType.a, FragmentationTerminus.FivePrime, 201, 4, 4, 0.0); + + Assert.That(product1.Equals(product1)); + Assert.That(product1.Equals(product2)); + Assert.That(product1.GetHashCode(), Is.EqualTo(product2.GetHashCode())); + Assert.That(!product1.Equals(uniqueProduct)); + Assert.That(!product1.Equals(null)); + Assert.That(product1.GetHashCode(), Is.Not.EqualTo(uniqueProduct.GetHashCode())); + + Assert.That(product1.Equals((object)product1)); + Assert.That(product1.Equals((object)product2)); + Assert.That(!product1.Equals((object)uniqueProduct)); + Assert.That(!product1.Equals((object)new Product(ProductType.d, FragmentationTerminus.N, 200, 4, 4, 0.0))); + Assert.That(!product1.Equals((object)null)); + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestNucleicAcid.cs b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs new file mode 100644 index 000000000..47e98d708 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestNucleicAcid.cs @@ -0,0 +1,174 @@ +using NUnit.Framework.Legacy; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Chemistry; +using Omics.Fragmentation; +using Transcriptomics; +using UsefulProteomicsDatabases; + +namespace Test.Transcriptomics +{ + /// + /// Test Data generated with http://rna.rega.kuleuven.be/masspec/mongo.htm + /// + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestNucleicAcid + { + public record SixmerTestCase(string Sequence, ProductType Type, double[] 
NeutralMasses, string[] ChemicalFormulas); + + public static IEnumerable GetSixmerIndividualFragmentTypeTestCases() + { + Loaders.LoadElements(); + + yield return new SixmerTestCase("GUACUG", ProductType.a, + new[] { 267.089, 573.114, 902.167, 1207.208, 1513.233 }, + new[] { "C10H13N5O4", "C19H24N7O12P", "C29H36N12O18P2", "C38H48N15O25P3", "C47H59N17O33P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.b, + new[] { 283.084, 589.109, 918.162, 1223.203, 1529.228 }, + new[] { "C10H13N5O5", "C19H24N7O13P", "C29H36N12O19P2", "C38H48N15O26P3", "C47H59N17O34P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.c, + new[] { 347.055, 653.081, 982.133, 1287.174, 1593.2 }, + new[] { "C10H14N5O7P", "C19H25N7O15P2", "C29H37N12O21P3", "C38H49N15O28P4", "C47H60N17O36P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.d, + new[] { 363.05, 669.075, 998.128, 1303.169, 1609.195 }, + new[] { "C10H14N5O8P", "C19H25N7O16P2", "C29H37N12O22P3", "C38H49N15O29P4", "C47H60N17O37P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.dWaterLoss, + new[] { 345.039, 651.064, 980.116, 1285.157, 1591.184 }, + new[] { "C10H12N5O7P", "C19H23N7O15P2", "C29H35N12O21P3", "C38H47N15O28P4", "C47H58N17O36P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.w, + new[] { 363.049, 669.074, 974.115, 1303.169, 1609.195 }, + new[] { "C10H14N5O8P", "C19H25N7O16P2", "C28H37N10O23P3", "C38H49N15O29P4", "C47H60N17O37P5", }); + yield return new SixmerTestCase("GUACUG", ProductType.x, + new[] { 347.055, 653.081, 958.122, 1287.174, 1593.2 }, + new[] { "C10H14N5O7P", "C19H25N7O15P2", "C28H37N10O22P3", "C38H49N15O28P4", "C47H60N17O36P5" }); + yield return new SixmerTestCase("GUACUG", ProductType.y, + new[] { 283.084, 589.109, 894.15, 1223.203, 1529.228 }, + new[] { "C10H13N5O5", "C19H24N7O13P", "C28H36N10O20P2", "C38H48N15O26P3", "C47H59N17O34P4", }); + yield return new SixmerTestCase("GUACUG", ProductType.z, + new[] { 267.089, 573.124, 878.156, 1207.208, 
1513.233 }, + new[] { "C10H13N5O4", "C19H24N7O12P", "C28H36N10O19P2", "C38H48N15O25P3", "C47H59N17O33P4", }); + + + yield return new SixmerTestCase("GUACUG", ProductType.aBaseLoss, + new[] { 114.03, 459.07, 765.095, 1094.147, 1399.198 }, + new[] { "C5H6O3", "C15H18N5O10P", "C24H29N7O18P2", "C34H41N12O24P3", "C43H53N15O31P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.bBaseLoss, + new[] { 130.027, 475.074, 781.099, 1110.152, 1415.193 }, + new[] { "C5H6O4", "C15H18N5O11P", "C24H29N7O19P2", "C34H41N12O25P3", "C43H53N15O32P4" }); + yield return new SixmerTestCase("GUACUG", ProductType.cBaseLoss, + new[] { 193.998, 539.045, 845.071, 1174.123, 1479.164 }, + new[] { "C5H7O6P", "C15H19N5O13P2", "C24H30N7O21P3", "C34H42N12O27P4", "C43H54N15O34P5" }); + yield return new SixmerTestCase("GUACUG", ProductType.dBaseLoss, + new[] { 209.993, 555.04, 861.066, 1190.118, 1495.16 }, + new[] { "C5H7O7P", "C15H19N5O14P2", "C24H30N7O22P3", "C34H42N12O28P4", "C43H54N15O35P5" }); + + // TODO: Add water loss besides d-H2O + } + + + [Test] + [TestCase("GUACUG", 1874.281)] + [TestCase("A", 267.096)] + [TestCase("C", 243.085)] + [TestCase("U", 244.069)] + [TestCase("G", 283.091)] + [TestCase("GU", 589.116)] + [TestCase("AAA", 925.200)] + [TestCase("CCC", 853.166)] + [TestCase("UUU", 856.119)] + [TestCase("GGG", 973.185)] + public void TestConstructorsAndEquality(string sequence, double monoMass) + { + // test constructors and equality + RNA rna = new RNA(sequence); + + Assert.That(rna.Length, Is.EqualTo(sequence.Length)); + Assert.That(rna.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna.GetChemicalFormula().MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna.NucleicAcidArray.Length, Is.EqualTo(sequence.Length)); + CollectionAssert.AreEqual(rna.NucleicAcidArray.Select(p => p.Letter), sequence); + Assert.That(rna.FivePrimeTerminus.Equals(NucleicAcid.DefaultFivePrimeTerminus)); + 
Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + rna.ThreePrimeTerminus = rna.ThreePrimeTerminus; + Assert.That(rna.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + + List nucList = new(); + foreach (var nucleotide in sequence) + { + nucList.Add(Nucleotide.GetResidue(nucleotide)); + } + Assert.That(rna.NucleicAcidArray.SequenceEqual(nucList.ToArray())); + + var rna2 = new RNA(sequence, NucleicAcid.DefaultFivePrimeTerminus, NucleicAcid.DefaultThreePrimeTerminus); + + Assert.That(rna2.Length, Is.EqualTo(sequence.Length)); + Assert.That(rna2.MonoisotopicMass, Is.EqualTo(monoMass).Within(0.01)); + Assert.That(rna2.FivePrimeTerminus.Equals(NucleicAcid.DefaultFivePrimeTerminus)); // check the explicit-termini instance, not rna a second time + Assert.That(rna2.ThreePrimeTerminus.Equals(NucleicAcid.DefaultThreePrimeTerminus)); + nucList.Clear(); + foreach (var nucleotide in sequence) + { + nucList.Add(Nucleotide.GetResidue(nucleotide)); + } + Assert.That(rna2.NucleicAcidArray.SequenceEqual(nucList.ToArray())); + + Assert.That(rna.Equals(rna2)); + Assert.That(rna.Equals(rna)); + Assert.That(!rna.Equals(null)); + Assert.That(rna.Equals((object)rna2)); + Assert.That(rna.Equals((object)rna)); + Assert.That(!rna.Equals((object)null)); + Assert.That(!rna.Equals((object)new Double())); + } + + [Test] + public void TestParseSequence() // inputs containing spaces or '*' must parse to the same RNA as the plain sequence; '~' must be rejected + { + var rna1 = new RNA("GUACUG"); + var rna2 = new RNA("GU ACU G"); + var rna3 = new RNA("GU*ACU*G"); + + Assert.That(rna1.BaseSequence, Is.EqualTo(rna2.BaseSequence)); + Assert.That(rna1.BaseSequence, Is.EqualTo(rna3.BaseSequence)); + Assert.That(rna1.GetHashCode(), Is.EqualTo(rna2.GetHashCode())); // each property is compared once against rna2 and once against rna3 + Assert.That(rna1.GetHashCode(), Is.EqualTo(rna3.GetHashCode())); + Assert.That(rna1.Length, Is.EqualTo(rna2.Length)); + Assert.That(rna1.Length, Is.EqualTo(rna3.Length)); + + Assert.Throws(() => new RNA("GUA~CUG")); + } + + [Test] + [TestCase("GUACUG", new[] { -1, -2, -3, -4, -5 }, new[] { 1873.273, 936.133, 623.752, 467.562, 373.848 })] + public void 
TestElectroSpraySeries(string sequence, int[] charges, double[] mzs) + { + RNA rna = new(sequence); + + var esiSeries = rna.GetElectrospraySeries(charges.First(), charges.Last()).ToArray(); + for (int j = 0; j < mzs.Length; j++) + { + var ion = esiSeries[j]; + Assert.That(ion, Is.EqualTo(mzs[j]).Within(0.01)); + } + } + + [Test] + [TestCase("GUACUG", new[] { -1, -2, -3, -4, -5, -6 }, new[] { 1953.239, 976.116, 650.408, 487.554, 389.841, 324.700 })] + public void TestReplaceTerminusWithElectroSpraySeries(string sequence, int[] charges, double[] mzs) // same series check after the 5' terminus is replaced with H + { + RNA rna = new(sequence); // build from the test-case argument rather than a duplicated literal, so added cases are actually exercised + rna.FivePrimeTerminus = ChemicalFormula.ParseFormula("H1"); + + var esiSeries = rna.GetElectrospraySeries(charges.Last(), charges.First()).ToArray(); + for (int j = 0; j < mzs.Length; j++) + { + var ion = esiSeries[j]; + Assert.That(ion, Is.EqualTo(mzs[j]).Within(0.01)); + } + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestNucleotide.cs b/mzLib/Test/Transcriptomics/TestNucleotide.cs index df250fd40..277ebc3d6 100644 --- a/mzLib/Test/Transcriptomics/TestNucleotide.cs +++ b/mzLib/Test/Transcriptomics/TestNucleotide.cs @@ -9,12 +9,12 @@ namespace Test.Transcriptomics { [ExcludeFromCodeCoverage] - internal class TestNucleotide + public class TestNucleotide { - internal record NucleotideTestCase(Nucleotide Nucleotide, string Name, char OneLetterCode, string Symbol, ChemicalFormula Formula, double Mass, + public record NucleotideTestCase(Nucleotide Nucleotide, string Name, char OneLetterCode, string Symbol, ChemicalFormula Formula, double Mass, ChemicalFormula nucleosideFormula); - internal static IEnumerable GetNucleotideTestCases() + public static IEnumerable GetNucleotideTestCases() { Loaders.LoadElements(); diff --git a/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs b/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs new file mode 100644 index 000000000..2da62943f --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestOligoWithSetMods.cs @@ -0,0 +1,175 @@ +using System; +using 
System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using NUnit.Framework; +using Omics.Modifications; +using Transcriptomics.Digestion; +using Transcriptomics; +using Omics; + +namespace Test.Transcriptomics +{ + [ExcludeFromCodeCoverage] + public static class TestOligoWithSetMods + { + [Test] + [TestCase( 0, 1, 20.45)] + [TestCase(1, 1, 20.45)] + [TestCase( 0, 2, 20.45)] + [TestCase(1, 2, 20.45)] + [TestCase( 0, 5, 28.37)] + [TestCase(1, 5, 28.37)] + [TestCase( 0, 6, 28.37)] + [TestCase(1, 6, 28.37)] + public static void TestLocalize(int modsOnOligo, int indexOfMass, double massToLocalize) + { + var oligoWithSetMods = new RNA("GUACUG", + oneBasedPossibleLocalizedModifications: new Dictionary> { { 4, [TestDigestion.PotassiumAdducts[1]] } }) + .Digest(new RnaDigestionParams(), [], []) + .ElementAt(modsOnOligo); + + Assert.That(oligoWithSetMods.AllModsOneIsNterminus.Count, Is.EqualTo(modsOnOligo)); + + // Act + var localizedOligo = oligoWithSetMods.Localize(indexOfMass - 2, massToLocalize); + + // Assert + int expectedModificationCount; + double expectedMass; + if (modsOnOligo == 1) // if the oligo started with a mod + { + int indexOfOriginalMod = oligoWithSetMods.AllModsOneIsNterminus.Keys.First(); + + // ensure original modification exist + Assert.That(localizedOligo.AllModsOneIsNterminus.ContainsKey(indexOfOriginalMod)); + + if (indexOfOriginalMod != indexOfMass) // Additional mass was added to a different location + { + expectedModificationCount = modsOnOligo + 1; + expectedMass = massToLocalize; + + // ensure original modification is still intact + Assert.That(oligoWithSetMods.OneBasedPossibleLocalizedModifications[indexOfOriginalMod][0].MonoisotopicMass, + Is.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfOriginalMod].MonoisotopicMass)); + } + else // Additional mass was added to the location of an existing modification + { + expectedModificationCount = modsOnOligo; + 
expectedMass = massToLocalize + TestDigestion.PotassiumAdducts[1].MonoisotopicMass!.Value; + + // ensure original modification has been altered + Assert.That(oligoWithSetMods.OneBasedPossibleLocalizedModifications[indexOfOriginalMod][0].MonoisotopicMass, + Is.Not.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfOriginalMod].MonoisotopicMass)); + } + } + else // oligo started with no modifications + { + expectedModificationCount = modsOnOligo + 1; + expectedMass = massToLocalize; + } + + + Assert.That(expectedModificationCount, Is.EqualTo(localizedOligo.AllModsOneIsNterminus.Count)); + Assert.That(localizedOligo.AllModsOneIsNterminus.ContainsKey(indexOfMass)); + Assert.That(expectedMass, Is.EqualTo(localizedOligo.AllModsOneIsNterminus[indexOfMass].MonoisotopicMass)); + } + + [Test] + public static void TestEquality() + { + var modDict = new Dictionary> { { 4, [TestDigestion.PotassiumAdducts[1]] } }; + var oligoWithSetMods = new RNA("GUACUG", + oneBasedPossibleLocalizedModifications: modDict) + .Digest(new RnaDigestionParams(), [], []) + .ElementAt(1); + + IBioPolymerWithSetMods oligoWithSetMods2 = new RNA("GUACUG", + oneBasedPossibleLocalizedModifications: modDict) + .Digest(new RnaDigestionParams(), [], []) + .ElementAt(1); + + // same oligos + Assert.That(oligoWithSetMods.Equals(oligoWithSetMods2)); + Assert.That(oligoWithSetMods.Equals((object)oligoWithSetMods2)); + Assert.That(oligoWithSetMods.Equals((OligoWithSetMods)oligoWithSetMods2)); + Assert.That(oligoWithSetMods.Equals(oligoWithSetMods)); + Assert.That(oligoWithSetMods.Equals((object)oligoWithSetMods)); + Assert.That(oligoWithSetMods.Equals((OligoWithSetMods)oligoWithSetMods)); + Assert.That(oligoWithSetMods.GetHashCode(), Is.EqualTo(oligoWithSetMods2.GetHashCode())); + + // all fail on null + Assert.That(!oligoWithSetMods2.Equals(null)); + Assert.That(!oligoWithSetMods2.Equals((object)null)); + Assert.That(!oligoWithSetMods2.Equals((OligoWithSetMods)null)); + + // Null parent checks + oligoWithSetMods 
= new(oligoWithSetMods.FullSequence, modDict.ToDictionary(p => p.Value.First().IdWithMotif, p => p.Value.First())); + oligoWithSetMods2 = new OligoWithSetMods(oligoWithSetMods.FullSequence, modDict.ToDictionary(p => p.Value.First().IdWithMotif, p => p.Value.First())); + var oligoWithSetMods3 = new OligoWithSetMods(oligoWithSetMods.FullSequence + "AGAUA", modDict.ToDictionary(p => p.Value.First().IdWithMotif, p => p.Value.First())); + + // same oligo null parent + Assert.That(oligoWithSetMods.Equals(oligoWithSetMods2)); + Assert.That(oligoWithSetMods.Equals((object)oligoWithSetMods2)); + Assert.That(oligoWithSetMods.Equals((OligoWithSetMods)oligoWithSetMods2)); + + // different oligo null parent + Assert.That(!oligoWithSetMods.Equals(oligoWithSetMods3)); + Assert.That(!oligoWithSetMods.Equals((object)oligoWithSetMods3)); + Assert.That(!oligoWithSetMods.Equals((IBioPolymerWithSetMods)oligoWithSetMods3)); + } + + [Test] + [TestCase("GUACUG", "GUACUGGUACUG", "RNase A")] + [TestCase("GUAGGAG", "GUAGCAG", "RNase A")] + public static void TestInequality_DifferentParentSameDigestionProduct(string sequence1, string sequence2, string enzyme) + { + var digestionParams = new RnaDigestionParams(rnase: enzyme, minLength: 1, maxMissedCleavages: 0); + + var oligo1 = new RNA(sequence1, "", "rna1", "", "") + .Digest(digestionParams, [], []) + .First(); + + var oligo2 = new RNA(sequence2, "", "rna3", "", "") + .Digest(digestionParams, [], []) + .First(); + + Assert.That(oligo1, Is.Not.EqualTo(oligo2)); + Assert.That(oligo1.Equals(oligo1)); + Assert.That(oligo1, Is.Not.EqualTo((object)oligo2)); + Assert.That(oligo1.GetHashCode(), Is.Not.EqualTo(oligo2.GetHashCode())); + } + + /// + /// The purpose of this test is to ensure that two oligos digested from two different rnases are not equal even if their sequences are equal + /// This is important for multiprotease parsimony in MetaMorpheus + /// + [Test] + [TestCase("AUAGUCUGG", "RNase T1", "colicin_E5")] + [TestCase("AUAGUCUGGGAUCUG", 
"RNase T1", "colicin_E5")] + public static void TestInequality_SameParentAndDigestionProduct_DifferentRnases(string sequence, string enzyme1, string enzyme2) + { + var digestionParams1 = new RnaDigestionParams(rnase: enzyme1, minLength: 1, maxMissedCleavages: 0); + var digestionParams2 = new RnaDigestionParams(rnase: enzyme2, minLength: 1, maxMissedCleavages: 0); + + var oligo1 = new RNA(sequence) + .Digest(digestionParams1, [], []) + .ToArray(); + + var oligo2 = new RNA(sequence) + .Digest(digestionParams2, [], []) + .ToArray(); + + Assert.That(oligo1.Length, Is.Not.EqualTo(oligo2.Length)); + + Assert.That(oligo1.First().BaseSequence, Is.EqualTo("AUAG")); + Assert.That(oligo2.First().BaseSequence, Is.EqualTo("AUAG")); + + Assert.That(oligo1.First(), Is.Not.EqualTo(oligo2.First())); // compare the shared AUAG product itself; the arrays are already known to differ in length + Assert.That(oligo1.First(), Is.Not.EqualTo((object)oligo2.First())); + Assert.That(oligo1.First().GetHashCode(), Is.Not.EqualTo(oligo2.First().GetHashCode())); // an array's own hash code says nothing about its elements + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestProductType.cs b/mzLib/Test/Transcriptomics/TestProductType.cs new file mode 100644 index 000000000..15757f4d2 --- /dev/null +++ b/mzLib/Test/Transcriptomics/TestProductType.cs @@ -0,0 +1,278 @@ +using System; +using System.Collections.Generic; +using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Chemistry; +using MassSpectrometry; +using NUnit.Framework; +using NUnit.Framework.Legacy; +using Omics.Fragmentation; +using Omics.Fragmentation.Oligo; +using Omics.Modifications; +using Transcriptomics; +using Transcriptomics.Digestion; + +namespace Test.Transcriptomics +{ + [TestFixture] + [ExcludeFromCodeCoverage] + public class TestProductType + { + [Test] + [TestCase(DissociationType.HCD, new[] { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, + ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M })] + [TestCase(DissociationType.CID, new[] { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, + 
ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M })] + public void TestProductTypes_Dissociation(DissociationType dissociation, ProductType[] products) + { + CollectionAssert.AreEquivalent(products, dissociation.GetRnaProductTypesFromDissociationType()); + } + + [Test] + [TestCase(FragmentationTerminus.FivePrime, new[] + { + ProductType.a, ProductType.aWaterLoss, ProductType.aBaseLoss, + ProductType.b, ProductType.bWaterLoss, ProductType.bBaseLoss, + ProductType.c, ProductType.cWaterLoss, ProductType.cBaseLoss, + ProductType.d, ProductType.dWaterLoss, ProductType.dBaseLoss, + })] + [TestCase(FragmentationTerminus.ThreePrime, new[] + { + ProductType.w, ProductType.wWaterLoss, ProductType.wBaseLoss, + ProductType.x, ProductType.xWaterLoss, ProductType.xBaseLoss, + ProductType.y, ProductType.yWaterLoss, ProductType.yBaseLoss, + ProductType.z, ProductType.zWaterLoss, ProductType.zBaseLoss, + })] + public void TestProductTypes_Terminus(FragmentationTerminus terminus, ProductType[] products) + { + CollectionAssert.AreEquivalent(products, terminus.GetRnaTerminusSpecificProductTypes()); + } + + [Test] + [TestCase(DissociationType.HCD, FragmentationTerminus.FivePrime, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, })] + [TestCase(DissociationType.HCD, FragmentationTerminus.ThreePrime, new[] + { ProductType.w, ProductType.x, ProductType.y, ProductType.z, })] + [TestCase(DissociationType.HCD, FragmentationTerminus.Both, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.b, ProductType.c, ProductType.d, ProductType.dWaterLoss, ProductType.w, ProductType.x, ProductType.y, ProductType.z, ProductType.M })] + [TestCase(DissociationType.CID, FragmentationTerminus.FivePrime, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss })] + [TestCase(DissociationType.CID, FragmentationTerminus.ThreePrime, new[] + { ProductType.w, ProductType.y, 
ProductType.yWaterLoss })] + [TestCase(DissociationType.CID, FragmentationTerminus.Both, new[] + { ProductType.a, ProductType.aBaseLoss, ProductType.c, ProductType.dWaterLoss, ProductType.w, ProductType.y, ProductType.yWaterLoss, ProductType.M })] + public void TestProductTypes_TerminusAndDissociation(DissociationType dissociation, FragmentationTerminus terminus, ProductType[] products) + { + CollectionAssert.AreEquivalent(products, dissociation.GetRnaTerminusSpecificProductTypesFromDissociation(terminus)); + } + + [Test] + public static void Test_NeutralMassShiftFromProductType() + { + foreach (ProductType p in Enum.GetValues(typeof(ProductType))) + { + double mass = 0; + switch (p) + { + case ProductType.a: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.b: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("OH").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.c: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3H2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.x: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-1H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.y: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-3P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.zWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-5H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case 
ProductType.aWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-1O-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.aBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.bBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O1H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.cWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.cBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3H-1P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.d: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O4H2P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.dWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O3P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.dBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O4H-1P").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + + case ProductType.w: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.wWaterLoss: + 
mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-1O-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.xWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-2H-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.yWaterLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.z: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + + case ProductType.wBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.xBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-1H-2").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.yBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-3H-2P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + case ProductType.zBaseLoss: + mass = p.GetRnaMassShiftFromProductType().RoundedDouble(2).Value; + Assert.That(ChemicalFormula.ParseFormula("O-4H-3P-1").MonoisotopicMass.RoundedDouble(2).Value, Is.EqualTo(mass)); + break; + } + } + } + + [Test] + public void TestProductTypes_GetRnaTerminusType() + { + foreach (var type in Enum.GetValues()) + { + switch (type) + { + case ProductType.a: + case ProductType.aWaterLoss: + case ProductType.aBaseLoss: + case ProductType.b: + case 
ProductType.bWaterLoss: + case ProductType.bBaseLoss: + case ProductType.c: + case ProductType.cWaterLoss: + case ProductType.cBaseLoss: + case ProductType.d: + case ProductType.dWaterLoss: + case ProductType.dBaseLoss: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.FivePrime)); + break; + + case ProductType.w: + case ProductType.wWaterLoss: + case ProductType.wBaseLoss: + case ProductType.x: + case ProductType.xWaterLoss: + case ProductType.xBaseLoss: + case ProductType.y: + case ProductType.yWaterLoss: + case ProductType.yBaseLoss: + case ProductType.z: + case ProductType.zWaterLoss: + case ProductType.zBaseLoss: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.ThreePrime)); + break; + + case ProductType.M: + Assert.That(type.GetRnaTerminusType(), Is.EqualTo(FragmentationTerminus.Both)); + break; + + case ProductType.aStar: + case ProductType.bAmmoniaLoss: + case ProductType.D: + case ProductType.Ycore: + case ProductType.Y: + case ProductType.aDegree: + case ProductType.yAmmoniaLoss: + case ProductType.zPlusOne: + case ProductType.zDot: + Assert.Throws(() => type.GetRnaTerminusType()); + break; + default: + throw new ArgumentOutOfRangeException(); + } + } + } + + [Test] + [TestCase(ProductType.a, ProductType.aWaterLoss)] + [TestCase(ProductType.b, ProductType.bWaterLoss)] + [TestCase(ProductType.c, ProductType.cWaterLoss)] + [TestCase(ProductType.d, ProductType.dWaterLoss)] + [TestCase(ProductType.w, ProductType.wWaterLoss)] + [TestCase(ProductType.x, ProductType.xWaterLoss)] + [TestCase(ProductType.y, ProductType.yWaterLoss)] + [TestCase(ProductType.z, ProductType.zWaterLoss)] + public void EnsureWaterLossMassesAreCorrect(ProductType normal, ProductType waterLoss) + { + var rna = new RNA("GUACUG") + .Digest(new RnaDigestionParams(), new List(), new List()) + .First() as OligoWithSetMods ?? 
throw new InvalidOperationException(); // the first digestion product is expected to be an OligoWithSetMods; NullReferenceException should not be thrown by user code + + List normalFragments = rna.GetNeutralFragments(normal).ToList(); + List waterLossFragments = rna.GetNeutralFragments(waterLoss).ToList(); + for (var index = 0; index < waterLossFragments.Count; index++) + { + var waterLossFragment = waterLossFragments[index]; + var normalFragment = normalFragments[index]; + var watermass = 2 * PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass + PeriodicTable.GetElement("O").PrincipalIsotope.AtomicMass; // neutral H2O: two hydrogen atoms (not bare protons) plus one oxygen + + Assert.That(normalFragment.MonoisotopicMass, Is.EqualTo(waterLossFragment.MonoisotopicMass + watermass).Within(0.01)); + } + } + } +} diff --git a/mzLib/Test/Transcriptomics/TestRnase.cs b/mzLib/Test/Transcriptomics/TestRnase.cs index db7d3e3dc..b122f32bd 100644 --- a/mzLib/Test/Transcriptomics/TestRnase.cs +++ b/mzLib/Test/Transcriptomics/TestRnase.cs @@ -1,5 +1,4 @@ using NUnit.Framework; -using Assert = NUnit.Framework.Legacy.ClassicAssert; using System.Diagnostics.CodeAnalysis; using System.IO; using Proteomics.ProteolyticDigestion; @@ -8,7 +7,7 @@ namespace Test.Transcriptomics { [ExcludeFromCodeCoverage] - internal class TestRnase + public class TestRnase { public static string rnaseTsvpath = Path.Combine(TestContext.CurrentContext.TestDirectory, @"Digestion\rnases.tsv"); @@ -16,7 +15,7 @@ internal class TestRnase public void TestRnaseDictionaryLoading() { var rnaseCountFromTsv = File.ReadAllLines(rnaseTsvpath).Length - 1; - Assert.AreEqual(RnaseDictionary.Dictionary.Count, rnaseCountFromTsv); + Assert.That(RnaseDictionary.Dictionary.Count, Is.EqualTo(rnaseCountFromTsv)); } [Test] diff --git a/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs b/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs new file mode 100644 index 000000000..cd6301a91 --- /dev/null +++ b/mzLib/TestFlashLFQ/ChromatographicPeakTests.cs @@ -0,0 +1,67 @@ +using FlashLFQ; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.Linq; +using System.Text; +using 
System.Threading.Tasks; + +namespace TestFlashLFQ +{ + public class ChromatographicPeakTests + { + private ChromatographicPeak CreateChromatographicPeak() + { + // Create a sample SpectraFileInfo + SpectraFileInfo spectraFileInfo = new SpectraFileInfo("sampleFile", "A", 1, 1, 1); + + // Create a sample Identification + Identification identification = new Identification(spectraFileInfo, "MPEPTIDE", "M[Oxidation]PEPTIDE", 100, 10, 2, new List()); + + // Create a ChromatographicPeak instance + ChromatographicPeak chromatographicPeak = new ChromatographicPeak(identification, false, spectraFileInfo); + + IndexedMassSpectralPeak peak1 = new IndexedMassSpectralPeak(100, 300, 1, 9.5); + IndexedMassSpectralPeak peak2 = new IndexedMassSpectralPeak(100, 300, 1, 10.5); + + // Add sample IsotopicEnvelopes + chromatographicPeak.IsotopicEnvelopes = new List() + { + new IsotopicEnvelope(peak1, 2, 300, 1), + new IsotopicEnvelope(peak2, 2, 300, 1) + }; + + return chromatographicPeak; + } + + + [Test] + public void TestResolveIdentifications() + { + // Arrange + ChromatographicPeak chromatographicPeak = CreateChromatographicPeak(); + + // Act + chromatographicPeak.ResolveIdentifications(); + + // Assert + Assert.AreEqual(1, chromatographicPeak.NumIdentificationsByBaseSeq); + Assert.AreEqual(1, chromatographicPeak.NumIdentificationsByFullSeq); + } + + [Test] + public void TestToString() + { + // Arrange + ChromatographicPeak chromatographicPeak = CreateChromatographicPeak(); + + // Act + string result = chromatographicPeak.ToString(); + + // Assert + string expected = "sampleFile\tMPEPTIDE\tM[Oxidation]PEPTIDE\t\t\t100\t10\t2\t51.007276466879\t0\t-\t-\t-\t-\t-\t0\tMSMS\t\t\t1\t1\t1\t0\tNaN\tFalse\tFalse"; + Assert.AreEqual(expected, result); + } + } +} diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index 6b8be0325..2a9703d95 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -17,6 +17,7 @@ using 
ChromatographicPeak = FlashLFQ.ChromatographicPeak; using Stopwatch = System.Diagnostics.Stopwatch; using TopDownProteomics; +using System.Data.Entity.Core.Metadata.Edm; namespace Test { @@ -427,12 +428,18 @@ public static void TestFlashLfqNormalization() var id3 = new Identification(mzml, "EGFQVADGPLYR", "EGFQVADGPLYR", 1350.65681, 94.12193, 2, new List { pg }); var id4 = new Identification(mzml2, "EGFQVADGPLYR", "EGFQVADGPLYR", 1350.65681, 94.12193, 2, new List { pg }); - results = new FlashLfqEngine(new List { id1, id2, id3, id4 }, normalize: true).Run(); + results = new FlashLfqEngine(new List { id1, id2, id3, id4 }, normalize: true, integrate: false).Run(); int int7 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw2)); int int8 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml2)); Assert.That(int7 > 0); Assert.That(int7 == int8); + + results.ReNormalizeResults(true); + int int9 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(raw2)); + int int10 = (int)System.Math.Round(results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml) + results.PeptideModifiedSequences["EGFQVADGPLYR"].GetIntensity(mzml2)); + Assert.That(int9 > int7); + Assert.That(int9, Is.EqualTo(int10).Within(1)); } [Test] @@ -487,15 +494,15 @@ public static void TestFlashLfqMergeResults() public static void TestFlashLfqMatchBetweenRuns() { List filesToWrite = new List { "mzml_1", "mzml_2" }; - List pepSequences = new List - { - "PEPTIDE", - "PEPTIDEV", - "PEPTIDEVV", + List pepSequences = new List + { + "PEPTIDE", + "PEPTIDEV", + "PEPTIDEVV", "TARGETPEP", "PEPTIDEVVV", - "PEPTIDEVVVV", - "PEPTIDEVVVVA", + "PEPTIDEVVVV", + "PEPTIDEVVVVA", "PEPTIDEVVVVAA" }; double 
intensity = 1e6; @@ -593,9 +600,6 @@ public static void TestFlashLfqMatchBetweenRuns() FlashLfqEngine engine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: true); FlashLfqEngine interquartileEngine = new FlashLfqEngine( new List { id1, id2, id3, id4, id5, id11, id12, id6, id7, id9, id10, id13, id14 }, matchBetweenRuns: true); - FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true, - peptideSequencesToUse: pepSequences); - //run the engine var results = engine.Run(); @@ -628,14 +632,14 @@ public static void TestFlashLfqMatchBetweenRuns() rtDiffs.Add(Math.Abs(file1Rt[i] - file2Rt[i])); } + FlashLfqEngine engineAmbiguous = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id18, id15, id16, id17 }, matchBetweenRuns: true, peptideSequencesToQuantify: pepSequences, donorCriterion: DonorCriterion.Intensity); // The ambiguous engine tests that a non-confident ID (i.e., a PSM that didn't make the peptide level fdr cutoff) - // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwriteen by confident MS2 ids + // gets overwritten by a MBR transfer of a confident ID, and that non-confident IDs are overwritten by confident MS2 ids results = engineAmbiguous.Run(); Assert.False(results.PeptideModifiedSequences.Select(kvp => kvp.Key).Contains("DECOYPEP")); Assert.False(results.Peaks[file1].Any(peak => peak.Identifications.Any(id => id.ModifiedSequence.Contains("DECOYPEP")))); Assert.That(results.Peaks[file2].Any(peak => peak.Identifications.First().ModifiedSequence == "TARGETPEP")); Assert.AreEqual(results.Peaks[file2].Count(peak => peak.IsMbrPeak), 2); - } [Test] @@ -1036,7 +1040,7 @@ public static void TestMatchBetweenRunsWithNoIdsInCommon() FlashLfqEngine engine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10 }, matchBetweenRuns: 
true); var results = engine.Run(); - // no assertions - just don't crash + Assert.Pass();// no assertions - just don't crash } [Test] @@ -1207,7 +1211,11 @@ public static void TestFlashLfqQoutputRealData() } } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, useSharedPeptidesForProteinQuant: true, maxThreads: -1); + var engine = new FlashLfqEngine(ids, + matchBetweenRuns: true, + requireMsmsIdInCondition: false, + useSharedPeptidesForProteinQuant: true, + maxThreads: -1); var results = engine.Run(); results.WriteResults(Path.Combine(outputDirectory,"peaks.tsv"), Path.Combine(outputDirectory, "peptides.tsv"), Path.Combine(outputDirectory, "proteins.tsv"), Path.Combine(outputDirectory, "bayesian.tsv"),true); @@ -1215,12 +1223,19 @@ public static void TestFlashLfqQoutputRealData() var peaks = results.Peaks.Values.ToList(); var peptides = results.PeptideModifiedSequences.Values.ToList(); var proteins = results.ProteinGroups.Values.ToList(); + var modInfo = results.ModInfo; + + Assert.AreEqual(6989789.488346225, peptides[0].GetTotalIntensity(), 0.0000001); + Assert.AreEqual(726036.539062, peptides[4].GetTotalIntensity(), 0.000001); + Assert.AreEqual(726036.539062, modInfo["Q7KZF4"].Proteins["Q7KZF4"].Peptides["EYGMIYLGK"].ModifiedAminoAcidPositions[4]["Common Variable:Oxidation on M"].Intensity, 0.000001); + Assert.AreEqual(modInfo["Q7KZF4"].Proteins["Q7KZF4"].Peptides["EYGMIYLGK"].Intensity, modInfo["Q7KZF4"].Proteins["Q7KZF4"].Peptides["EYGMIYLGK"].ModifiedAminoAcidPositions[4]["Common Variable:Oxidation on M"].Intensity, 0.000001); + Assert.AreEqual(4, peaks[0].Count(m => m.IsMbrPeak == false)); Assert.AreEqual(5, peaks[1].Count(m => m.IsMbrPeak == false)); - CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "Q7KZF4", "P52298", "Q15149" }, peaks[0].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); - CollectionAssert.AreEquivalent(new string[] { 
"Q7KZF4", "P52298", "Q15149", "Q7KZF4", "Q7KZF4", "P52298" }, peaks[1].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "Q7KZF4", "P52298", "Q15149", "Q15149" }, peaks[0].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "Q7KZF4", "P52298" }, peaks[1].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); Assert.AreEqual(6, peptides.Count); CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "P52298" }, peptides.Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); @@ -1344,6 +1359,7 @@ public static void RealDataMbrTest() double rt = double.Parse(split[2]); int z = (int)double.Parse(split[6]); var proteins = split[24].Split(new char[] { '|' }); + bool decoyPeptide = split[39].Equals("D"); List proteinGroups = new List(); foreach (var protein in proteins) { @@ -1358,66 +1374,62 @@ public static void RealDataMbrTest() } } - Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups); + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups, decoy: decoyPeptide); ids.Add(id); } - var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 5); + var engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: false, maxThreads: 1, matchBetweenRunsFdrThreshold: 0.15, maxMbrWindow: 1); var results = engine.Run(); + // Count the number of MBR results in each file var f1r1MbrResults = results .PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MBR && 
p.Value.GetDetectionType(f1r2) == DetectionType.MSMS).ToList(); - - Assert.That(f1r1MbrResults.Count >= 132); - - var f1r2MbrResults = results.PeptideModifiedSequences - .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MSMS && p.Value.GetDetectionType(f1r2) == DetectionType.MBR).ToList(); - - Assert.GreaterOrEqual(f1r2MbrResults.Count, 77); - - List<(double, double)> peptideIntensities = new List<(double, double)>(); - - foreach (var peptide in f1r1MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(f1r1)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(f1r2)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } + .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MBR && p.Value.GetDetectionType(f1r2) == DetectionType.MSMS) + .ToList(); + var f1r2MbrResults = results + .PeptideModifiedSequences + .Where(p => p.Value.GetDetectionType(f1r1) == DetectionType.MSMS && p.Value.GetDetectionType(f1r2) == DetectionType.MBR) + .ToList(); - double corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - Assert.Greater(corr, 0.8); + // Due to the small number of results in the test data, the counts and correlation values can be quite variable. + // Any change to ML.NET or the PEP Analysis engine will cause these to change. 
+ Console.WriteLine("r1 PIP event count: " + f1r1MbrResults.Count); + Console.WriteLine("r2 PIP event count: " + f1r2MbrResults.Count); + Assert.AreEqual(138, f1r1MbrResults.Count); + Assert.AreEqual(70, f1r2MbrResults.Count); - peptideIntensities.Clear(); - foreach (var peptide in f1r2MbrResults) - { - double mbrIntensity = Math.Log(peptide.Value.GetIntensity(f1r2)); - double msmsIntensity = Math.Log(peptide.Value.GetIntensity(f1r1)); - peptideIntensities.Add((mbrIntensity, msmsIntensity)); - } + // Check that MS/MS identified peaks and MBR identified peaks have similar intensities + List<(double, double)> peptideIntensities = f1r1MbrResults.Select(pep => (Math.Log(pep.Value.GetIntensity(f1r1)), Math.Log(pep.Value.GetIntensity(f1r2)))).ToList(); + double corrRun1 = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - corr = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); + peptideIntensities = f1r2MbrResults.Select(pep => (Math.Log(pep.Value.GetIntensity(f1r1)), Math.Log(pep.Value.GetIntensity(f1r2)))).ToList(); + double corrRun2 = Correlation.Pearson(peptideIntensities.Select(p => p.Item1), peptideIntensities.Select(p => p.Item2)); - // Update means more MBR-detections, which decreases the correlation slightly. 
Will increase again when we begin filtering based on MBR score - Assert.Greater(corr, 0.69); + // These values are also sensitive, changes can cause them to dip as low as 0.6 (specifically the corrRun2 value) + Console.WriteLine("r1 correlation: " + corrRun1); + Console.WriteLine("r2 correlation: " + corrRun2); + Assert.Greater(corrRun1, 0.75); + Assert.Greater(corrRun2, 0.65); // the "requireMsmsIdInCondition" field requires that at least one MS/MS identification from a protein // has to be observed in a condition for match-between-runs f1r1.Condition = "b"; engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 5); results = engine.Run(); - var proteinsObservedInF1 = ids.Where(p => p.FileInfo == f1r1).SelectMany(p => p.ProteinGroups).Distinct().ToList(); - var proteinsObservedInF2 = ids.Where(p => p.FileInfo == f1r2).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF1 = ids.Where(id => !id.IsDecoy).Where(p => p.FileInfo == f1r1).SelectMany(p => p.ProteinGroups).Distinct().ToList(); + var proteinsObservedInF2 = ids.Where(id => !id.IsDecoy).Where(p => p.FileInfo == f1r2).SelectMany(p => p.ProteinGroups).Distinct().ToList(); var proteinsObservedInF1ButNotF2 = proteinsObservedInF1.Except(proteinsObservedInF2).ToList(); foreach (ProteinGroup protein in proteinsObservedInF1ButNotF2) { Assert.That(results.ProteinGroups[protein.ProteinGroupName].GetIntensity(f1r2) == 0); } - List peptidesToUse = ids.Select(id => id.ModifiedSequence).Take(400).Distinct().ToList(); - engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, peptideSequencesToUse: peptidesToUse); + // Test that no decoys are reported in the final resultsw + Assert.AreEqual(0, ids.Where(id => id.IsDecoy).Count(id => results.ProteinGroups.ContainsKey(id.ProteinGroups.First().ProteinGroupName))); + + List peptidesToUse = ids.Where(id => id.QValue <= 0.007 & !id.IsDecoy).Select(id => 
id.ModifiedSequence).Distinct().ToList(); + engine = new FlashLfqEngine(ids, matchBetweenRuns: true, requireMsmsIdInCondition: true, maxThreads: 1, matchBetweenRunsFdrThreshold: 0.5, maxMbrWindow: 1, peptideSequencesToQuantify: peptidesToUse); results = engine.Run(); - var test = results.PeptideModifiedSequences.Select(kvp => !peptidesToUse.Contains(kvp.Key)).ToList(); CollectionAssert.AreEquivalent(results.PeptideModifiedSequences.Select(kvp => kvp.Key), peptidesToUse); } @@ -1489,7 +1501,7 @@ public static void ProteoformPeakfindingTest() Assert.That((int)results.PeptideModifiedSequences[sequence].GetIntensity(file1) == 1386491); ChromatographicPeak peak = results.Peaks[file1].First(p => p.Identifications.First().ModifiedSequence == sequence); - Assert.That(Math.Round(peak.MassError, 3) == 0); + Assert.That(Math.Round(peak.MassError, 3), Is.EqualTo(0)); Assert.That(peak.IsotopicEnvelopes.Count == 10); } @@ -1652,14 +1664,13 @@ public static void TestAmbiguousFraction() peak1.ResolveIdentifications(); peak2.ResolveIdentifications(); - peak1.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 1000)); - peak2.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 10000)); + peak1.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 1000, 1)); + peak2.IsotopicEnvelopes.Add(new FlashLFQ.IsotopicEnvelope(new IndexedMassSpectralPeak(0, 0, 0, 0), 1, 10000, 1)); peak1.CalculateIntensityForThisFeature(false); peak2.CalculateIntensityForThisFeature(false); - FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }, - new HashSet { "peptide1", "peptide2"}); + FlashLfqResults res = new FlashLfqResults(new List { fraction1, fraction2 }, new List { id1, id2, id3 }); res.Peaks[fraction1].Add(peak1); res.Peaks[fraction2].Add(peak2); res.CalculatePeptideResults(quantifyAmbiguousPeptides: false); diff --git 
a/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs index 72461fa9c..cd6412731 100644 --- a/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs +++ b/mzLib/TestFlashLFQ/TestIdentificationAdapter.cs @@ -62,7 +62,7 @@ public void TestFileNametoFilePath(string path) fullFilePath.Add(fullFilePath1); fullFilePath.Add(fullFilePath2); - Dictionary allFiles = file.FileNametoFilePath(fullFilePath); + Dictionary allFiles = file.FileNameToFilePath(fullFilePath); Assert.That(allFiles.TryGetValue(fileName, out var output)); Assert.AreEqual(output, fullFilePath1); @@ -81,7 +81,7 @@ public void TestFileNametoFilePathLocalPath(string path) string rawFilePath = @"DataFiles\SmallCalibratibleYeast.mzml"; fullFilePath.Add(rawFilePath); - Dictionary allFiles = file.FileNametoFilePath(fullFilePath); + Dictionary allFiles = file.FileNameToFilePath(fullFilePath); Assert.That(allFiles.TryGetValue(fileName, out var output)); Assert.AreEqual(output, rawFilePath); diff --git a/mzLib/TestFlashLFQ/TestPipEcho.cs b/mzLib/TestFlashLFQ/TestPipEcho.cs new file mode 100644 index 000000000..0d2388142 --- /dev/null +++ b/mzLib/TestFlashLFQ/TestPipEcho.cs @@ -0,0 +1,313 @@ +using NUnit.Framework; +using Readers; +using System.Collections.Generic; +using System.Linq; +using FlashLFQ; +using Assert = NUnit.Framework.Legacy.ClassicAssert; +using System.IO; +using FlashLFQ.PEP; +using System; +using Chemistry; +using MassSpectrometry; +using MzLibUtil; +using Test.FileReadingTests; +using UsefulProteomicsDatabases; + + +namespace TestFlashLFQ +{ + [TestFixture] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + public class TestPipEcho + { + [Test] + [TestCase(3)] + [TestCase(5)] + public static void TestDonorGroupEqualizer(int numGroups) + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", 
"H12", "HUMAN") }); + + ChromatographicPeak targetPeak = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + ChromatographicPeak decoyPeak = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: true); + targetPeak.MbrScore = 100; + + Random random = new Random(42); + List donorGroups = new List(); + for (int i = 0; i < 10000; i++) + { + int numberTargets = random.Next(0, 10); + int numberDecoys = random.Next(0, 10); + donorGroups.Add(new DonorGroup(id, Enumerable.Repeat(targetPeak, numberTargets).ToList(), Enumerable.Repeat(decoyPeak, numberDecoys).ToList())); + } + + donorGroups = PepAnalysisEngine.OrderDonorGroups(donorGroups); + var donorIndices = PepAnalysisEngine.GetDonorGroupIndices(donorGroups, numGroups: numGroups, scoreCutoff: 50); + + Assert.That(donorIndices.Count, Is.EqualTo(numGroups)); + List targetPeakCounts = new(); + List decoyPeakCounts = new(); + for (int i = 0; i < numGroups; i++) + { + int targetSum = 0; + int decoySum = 0; + foreach (int idx in donorIndices[i]) + { + targetSum += donorGroups[idx].TargetAcceptors.Count; + decoySum += donorGroups[idx].DecoyAcceptors.Count; + } + targetPeakCounts.Add(targetSum); + decoyPeakCounts.Add(decoySum); + } + + // Assert that each group has an approximately equal number of target peaks + Assert.That(targetPeakCounts.Max() - targetPeakCounts.Min(), Is.LessThanOrEqualTo(numGroups-1)); + // Assert that each group has an approximately equal number of decoy peaks + Assert.That(decoyPeakCounts.Max() - decoyPeakCounts.Min(), Is.LessThanOrEqualTo(numGroups - 1)); + } + + [Test] + public static void TestMbrScorer() + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + SpectraFileInfo fakeDonorFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + + double idMass = 669.4173; + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification id2 = new 
Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification donorId = new Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + id.PeakfindingMass = idMass; + id2.PeakfindingMass = idMass; + donorId.PeakfindingMass = idMass; + + var peak1 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var peak2 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var peak3 = new ChromatographicPeak(id2, isMbrPeak: false, fakeFile, randomRt: false); + var peak4 = new ChromatographicPeak(id, isMbrPeak: false, fakeFile, randomRt: false); + var donorPeak = new ChromatographicPeak(donorId, isMbrPeak: false, fakeDonorFile, randomRt: false); + var acceptorPeak = new ChromatographicPeak(donorId, isMbrPeak: true, fakeFile, randomRt: false); + + IndexedMassSpectralPeak imsPeak = new IndexedMassSpectralPeak((idMass + 0.001).ToMz(1), 1.1, 1, 25); + IndexedMassSpectralPeak imsPeak2 = new IndexedMassSpectralPeak((idMass - 0.001).ToMz(1), 1, 2, 26); + var iso1 = new FlashLFQ.IsotopicEnvelope(imsPeak, 1, 1, 0.98); + var iso2 = new FlashLFQ.IsotopicEnvelope(imsPeak2, 1, 1, 0.9); + + peak1.IsotopicEnvelopes.Add(iso1); + peak1.IsotopicEnvelopes.Add(iso2); + peak1.CalculateIntensityForThisFeature(false); + + peak4.IsotopicEnvelopes.Add(iso2); + peak4.CalculateIntensityForThisFeature(false); + + donorPeak.IsotopicEnvelopes.Add(iso2); + donorPeak.CalculateIntensityForThisFeature(false); + + acceptorPeak.IsotopicEnvelopes.Add(iso1); + acceptorPeak.CalculateIntensityForThisFeature(false); + + + var peakList = new List { peak1, peak4 }; + var peakDict = peakList.ToDictionary(keySelector: p => p.Apex.IndexedPeak, elementSelector: p => p); + + // Builds a scorer. 
Ppm Error and Intensity distributions both have mean and std-dev of 1 + MbrScorer scorer = new MbrScorer(peakDict, peakList, new MathNet.Numerics.Distributions.Normal(1, 1), new MathNet.Numerics.Distributions.Normal(1,1)); + + scorer.AddRtPredErrorDistribution(fakeDonorFile, new List { 0.5, 0.6, 0.5, 0.6, 0.5, 0.6, 0.5 }, 2); + + acceptorPeak.MbrScore = scorer.ScoreMbr(acceptorPeak, donorPeak, predictedRt: 25.1); + + Assert.That(acceptorPeak.MbrScore, Is.EqualTo(58.7).Within(0.1)); + Assert.That(acceptorPeak.PpmScore, Is.EqualTo(0.62).Within(0.01)); + Assert.That(acceptorPeak.IntensityScore, Is.EqualTo(0.32).Within(0.01)); + Assert.That(acceptorPeak.RtScore, Is.EqualTo(0.96).Within(0.01)); + Assert.That(acceptorPeak.ScanCountScore, Is.EqualTo(0.5).Within(0.01)); + Assert.That(acceptorPeak.IsotopicDistributionScore, Is.EqualTo(0.74).Within(0.01)); + } + + [Test] + public static void TestSpectraFileInfoString() + { + SpectraFileInfo fakeFile = new SpectraFileInfo(@"C:\Users\xyz\data\fakeFile.raw", "A", 1, 1, 1); + Assert.AreEqual("fakeFile.raw", fakeFile.ToString()); + } + + [Test] + public static void TestChromatographicPeakEquals() + { + SpectraFileInfo fakeFile = new SpectraFileInfo("fakeFile", "A", 1, 1, 1); + Identification id = new Identification(fakeFile, "KPVGAAK", "KPVGAAK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + Identification id2 = new Identification(fakeFile, "KPVGK", "KPVGK", 669.4173, 1.9398, 2, new List { new ProteinGroup("P16403", "H12", "HUMAN") }); + + var peak1 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + var peak2 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + var peak3 = new ChromatographicPeak(id2, isMbrPeak: true, fakeFile, randomRt: false); + var peak4 = new ChromatographicPeak(id, isMbrPeak: true, fakeFile, randomRt: false); + + IndexedMassSpectralPeak imsPeak = new IndexedMassSpectralPeak(1, 1, 1, 25); + IndexedMassSpectralPeak imsPeak2 = 
new IndexedMassSpectralPeak(1, 1, 1, 50); + var iso1 = new FlashLFQ.IsotopicEnvelope(imsPeak, 1, 1, 1); + var iso2 = new FlashLFQ.IsotopicEnvelope(imsPeak2, 1, 1, 1); + + peak1.IsotopicEnvelopes.Add(iso1); + peak1.CalculateIntensityForThisFeature(false); + + peak2.IsotopicEnvelopes.Add(iso1); + peak2.CalculateIntensityForThisFeature(false); + + peak3.IsotopicEnvelopes.Add(iso1); + peak3.CalculateIntensityForThisFeature(false); + + peak4.IsotopicEnvelopes.Add(iso2); + peak4.CalculateIntensityForThisFeature(false); + + Assert.That(peak1.Equals(peak2)); + Assert.That(!peak1.Equals(peak3)); + Assert.That(!peak1.Equals(peak4)); + + } + + /// + /// This test MatchBetweenRuns by creating two fake mzML files and a list of fake IDs. + /// There are multiple sets of IDs, where most are shared between the two runs but one+ is/are missing + /// MBR is tested by ensuring that IDs are transferred between runs + /// + [Test] + public static void TestFlashLfqMatchBetweenRunsNearestNeighborDonors() + { + List filesToWrite = new List { "mzml_1", "mzml_2", "mzml_3" }; + List pepSequences = new List + { + "PEPTIDE", + "PEPTIDEV", + "PEPTIDEVV", + "TARGETPEP", + "PEPTIDEVVV", + "PEPTIDEVVVV", + "PEPTIDEVVVVA", + "PEPTIDEVVVVAA" + }; + double intensity = 1e6; + + double[] file1Rt = new double[] { 1.01, 1.02, 1.03, 1.033, 1.035, 1.04, 1.045, 1.05 }; + double[] file2Rt = new double[] { 1.00, 1.025, 1.03, 1.031, 1.035, 1.04, 1.055, 1.07 }; + + Loaders.LoadElements(); + + // generate mzml files (5 peptides each) + for (int f = 0; f < filesToWrite.Count; f++) + { + // 1 MS1 scan per peptide + MsDataScan[] scans = new MsDataScan[8]; + + for (int p = 0; p < pepSequences.Count; p++) + { + ChemicalFormula cf = new Proteomics.AminoAcidPolymer.Peptide(pepSequences[p]).GetChemicalFormula(); + IsotopicDistribution dist = IsotopicDistribution.GetDistribution(cf, 0.125, 1e-8); + double[] mz = dist.Masses.Select(v => v.ToMz(1)).ToArray(); + double[] intensities = dist.Intensities.Select(v => v * 
intensity).ToArray(); + if(f == 2) + { + // Make file 3 the most intense + intensities = intensities.Select(v => v * 5).ToArray(); + } + double rt; + if (f == 1) + { + rt = file2Rt[p]; + } + else + { + rt = file1Rt[p]; + } + + // add the scan + scans[p] = new MsDataScan(massSpectrum: new MzSpectrum(mz, intensities, false), oneBasedScanNumber: p + 1, msnOrder: 1, isCentroid: true, + polarity: Polarity.Positive, retentionTime: rt, scanWindowRange: new MzRange(400, 1600), scanFilter: "f", + mzAnalyzer: MZAnalyzerType.Orbitrap, totalIonCurrent: intensities.Sum(), injectionTime: 1.0, noiseData: null, nativeId: "scan=" + (p + 1)); + } + + // write the .mzML + Readers.MzmlMethods.CreateAndWriteMyMzmlWithCalibratedSpectra(new FakeMsDataFile(scans), + Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[f] + ".mzML"), false); + } + + // set up spectra file info + SpectraFileInfo file1 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[0] + ".mzML"), "a", 0, 0, 0); + SpectraFileInfo file2 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[1] + ".mzML"), "a", 1, 0, 0); + SpectraFileInfo file3 = new SpectraFileInfo(Path.Combine(TestContext.CurrentContext.TestDirectory, filesToWrite[2] + ".mzML"), "a", 2, 0, 0); + + // create some PSMs + var pg = new ProteinGroup("MyProtein", "gene", "org"); + Identification id1 = new Identification(file1, "PEPTIDE", "PEPTIDE", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file1Rt[0] + 0.001, 1, new List { pg }); + Identification id2 = new Identification(file1, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file1Rt[1] + 0.001, 1, new List { pg }); + Identification id3 = new Identification(file1, "PEPTIDEVV", "PEPTIDEVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List { pg }); + Identification id4 = new Identification(file1, 
"PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); + Identification id5 = new Identification(file1, "PEPTIDEVVVV", "PEPTIDEVVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file1Rt[5] + 0.001, 1, new List { pg }); + + Identification id6 = new Identification(file2, "PEPTIDE", "PEPTIDE", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDE").MonoisotopicMass, file2Rt[0] + 0.001, 1, new List { pg }); + Identification id7 = new Identification(file2, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file2Rt[1] + 0.001, 1, new List { pg }); + // missing ID 8 - MBR feature - "PEPTIDEVV" + + Identification id9 = new Identification(file2, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file2Rt[4] + 0.001, 1, new List { pg }); + Identification id10 = new Identification(file2, "PEPTIDEVVVV", "PEPTIDEVVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVVV").MonoisotopicMass, file2Rt[5] + 0.001, 1, new List { pg }); + + + Identification id11 = new Identification(file3, "PEPTIDEV", "PEPTIDEV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEV").MonoisotopicMass, file1Rt[1] + 0.001, 1, new List { pg }); // same as peak 2 + Identification id12 = new Identification(file3, "PEPTIDEVV", "PEPTIDEVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVV").MonoisotopicMass, file1Rt[2] + 0.001, 1, new List { pg }); // same as peak 3, but higher intensity + Identification id13 = new Identification(file3, "PEPTIDEVVV", "PEPTIDEVVV", + new Proteomics.AminoAcidPolymer.Peptide("PEPTIDEVVV").MonoisotopicMass, file1Rt[4] + 0.001, 1, new List { pg }); // same as peak 4 + + + // create the FlashLFQ engine + FlashLfqEngine neighborsEngine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id11, id12, id13 }, + matchBetweenRuns: true, 
donorCriterion: DonorCriterion.Neighbors); + + //run the engine + var results = neighborsEngine.Run(); + + Assert.That(results.Peaks[file2].Count == 5); + Assert.That(results.Peaks[file2].Where(p => p.IsMbrPeak).Count() == 1); + + var peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); + var otherFilePeak = results.Peaks[file1].Where(p => p.Identifications.First().BaseSequence == + peak.Identifications.First().BaseSequence).First(); + + + Assert.That(peak.Intensity > 0); + Assert.That(peak.Intensity == otherFilePeak.Intensity); + Assert.That(peak.Identifications.First().FileInfo == file1); // assure that the ID came from file 1, ie, the donor with the most neighboring peaks + + // create the FlashLFQ engine + FlashLfqEngine intensityEngine = new FlashLfqEngine(new List { id1, id2, id3, id4, id5, id6, id7, id9, id10, id11, id12, id13 }, + matchBetweenRuns: true, donorCriterion: DonorCriterion.Intensity); + + //run the engine + results = intensityEngine.Run(); + + Assert.That(results.Peaks[file2].Count == 5); + Assert.That(results.Peaks[file2].Where(p => p.IsMbrPeak).Count() == 1); + + peak = results.Peaks[file2].Where(p => p.IsMbrPeak).First(); + otherFilePeak = results.Peaks[file3].Where(p => p.Identifications.First().BaseSequence == + peak.Identifications.First().BaseSequence).First(); + + + Assert.That(peak.Intensity > 0); + Assert.That(peak.Intensity, Is.EqualTo(otherFilePeak.Intensity/5).Within(1)); // file 3 is five times more intense than file 2 + Assert.That(peak.Identifications.First().FileInfo == file3); // assure that the ID came from file 3, ie, the most intense donor peaks + + } + + } +} diff --git a/mzLib/Transcriptomics/ClassExtensions.cs b/mzLib/Transcriptomics/ClassExtensions.cs new file mode 100644 index 000000000..ef56c737d --- /dev/null +++ b/mzLib/Transcriptomics/ClassExtensions.cs @@ -0,0 +1,119 @@ +using Omics.Modifications; +using System.Text; +using Transcriptomics.Digestion; + +namespace Transcriptomics +{ + public static class 
ClassExtensions + { + /// + /// Creates a new instance of a nucleic acid or oligo with set modifications, optionally updating its sequence, modifications, and decoy status. + /// + /// The type of the nucleic acid, which must implement . + /// The target nucleic acid or oligo with set modifications to base the new instance on. + /// The new sequence string, if any. If null, the original sequence is used. + /// A dictionary of modifications to apply, if any. If null, the original modifications are used. + /// A flag indicating whether the sequence is a decoy, if any. If null, the original decoy status is used. + /// A new instance of the specified nucleic acid type with the provided or existing properties. + /// + /// This method facilitates the generation of new sequences for both nucleic acids and oligos with set modifications by allowing + /// optional updates to the sequence string, modifications, and decoy status. It ensures that the new instances are properly + /// initialized with the provided or existing properties, enabling further analysis of modified sequences and future generation of decoys on the fly. + /// + public static T CreateNew(this T target, string? sequence = null, IDictionary>? modifications = null, + bool? isDecoy = null) + where T : INucleicAcid + { + // set new object parameters where not null + object? returnObj = null; + string newSequence = sequence ?? target.BaseSequence; + IDictionary> newModifications = modifications ?? target.OneBasedPossibleLocalizedModifications; + + switch (target) + { + case RNA rna: + { + bool newIsDecoy = isDecoy ?? rna.IsDecoy; + returnObj = new RNA(newSequence, rna.Name, rna.Accession, rna.Organism, rna.DatabaseFilePath, + rna.FivePrimeTerminus, rna.ThreePrimeTerminus, newModifications, rna.IsContaminant, newIsDecoy, rna.GeneNames.ToList(), rna.AdditionalDatabaseFields); + break; + } + case OligoWithSetMods oligo: + { + var oldParent = oligo.Parent as RNA ?? 
throw new NullReferenceException(); + bool newIsDecoy = isDecoy ?? oldParent.IsDecoy; + var newParent = new RNA( + newSequence, + oldParent.Name, + oldParent.Accession, + oldParent.Organism, + oldParent.DatabaseFilePath, + oldParent.FivePrimeTerminus, + oldParent.ThreePrimeTerminus, + newModifications, + oldParent.IsContaminant, + newIsDecoy, + oldParent.GeneNames.ToList(), + oldParent.AdditionalDatabaseFields); + + returnObj = new OligoWithSetMods( + newParent, + (oligo.DigestionParams as RnaDigestionParams)!, + oligo.OneBasedStartResidue, + oligo.OneBasedEndResidue, + oligo.MissedCleavages, + oligo.CleavageSpecificityForFdrCategory, + newModifications.ToDictionary(p => p.Key, p => p.Value.First()), + oligo.NumFixedMods, + oligo.FivePrimeTerminus, + oligo.ThreePrimeTerminus); + break; + } + default: + throw new ArgumentException("INucleicAcid type not yet implemented"); + } + + return (T)returnObj ?? throw new NullReferenceException("Error creating new INucleicAcid"); + } + + /// + /// Transcribes a DNA sequence into an RNA sequence + /// + /// The input dna sequence + /// True if the input sequence is the coding strand, False if the input sequence is the template strand + /// + public static string Transcribe(this string dna, bool isCodingStrand = true) + { + var sb = new StringBuilder(); + foreach (var residue in dna) + { + if (isCodingStrand) + { + sb.Append(residue == 'T' ? 
'U' : residue); + } + else + { + switch (residue) + { + case 'A': + sb.Append('U'); + break; + case 'T': + sb.Append('A'); + break; + case 'C': + sb.Append('G'); + break; + case 'G': + sb.Append('C'); + break; + default: + sb.Append(residue); + break; + } + } + } + return sb.ToString(); + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs new file mode 100644 index 000000000..767fb1564 --- /dev/null +++ b/mzLib/Transcriptomics/Digestion/NucleolyticOligo.cs @@ -0,0 +1,86 @@ +using Chemistry; +using Omics.Digestion; +using Omics.Modifications; + +namespace Transcriptomics.Digestion +{ + /// + /// The most basic form of a digested oligo, this class does not care about mass or formula, just base sequence + /// + public class NucleolyticOligo : DigestionProduct + { + protected IHasChemicalFormula _fivePrimeTerminus; + protected IHasChemicalFormula _threePrimeTerminus; + + internal NucleolyticOligo(NucleicAcid nucleicAcid, int oneBaseStartResidue, + int oneBasedEndResidue, int missedCleavages, CleavageSpecificity cleavageSpecificity, + IHasChemicalFormula? fivePrimeTerminus, IHasChemicalFormula? threePrimeTerminus) + : base(nucleicAcid, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, cleavageSpecificity) + { + _fivePrimeTerminus = fivePrimeTerminus ?? NucleicAcid.DefaultFivePrimeTerminus; + _threePrimeTerminus = threePrimeTerminus ?? NucleicAcid.DefaultThreePrimeTerminus; + } + + /// + /// Nucleic acid this oligo was digested from + /// + public NucleicAcid NucleicAcid + { + get => Parent as NucleicAcid; + protected set => Parent = value; + } + + public override string ToString() + { + return BaseSequence; + } + + /// + /// Generates a collection of oligos with set modifications based on the provided fixed and variable modifications, + /// digestion parameters, and the nucleic acid sequence. + /// + /// A collection of all known fixed modifications. + /// Parameters for RNA digestion. 
+ /// A list of variable modifications to consider. + /// An enumerable collection of oligos with set modifications. + /// + /// Code heavily borrowed from ProteolyticPeptide.GetModifiedPeptides + /// + internal IEnumerable GenerateModifiedOligos(List allKnownFixedMods, + RnaDigestionParams digestionParams, List variableModifications) + { + int variableModificationIsoforms = 0; + int oligoLength = OneBasedEndResidue - OneBasedStartResidue + 1; + int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; + int maxModsForOligo = digestionParams.MaxMods; + var twoBasedPossibleVariableAndLocalizeableModifications = DictionaryPool.Get(); + var fixedModDictionary = FixedModDictionaryPool.Get(); + + try + { + PopulateVariableModifications(variableModifications, in twoBasedPossibleVariableAndLocalizeableModifications); + PopulateFixedModsOneIsNorFivePrimeTerminus(oligoLength, allKnownFixedMods, in fixedModDictionary); + + // Add the mods to the oligo by return numerous OligoWithSetMods + foreach (Dictionary variableModPattern in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForOligo, oligoLength)) + { + AppendFixedModificationsToVariable(in fixedModDictionary, in variableModPattern, out int numFixedMods); + + yield return new OligoWithSetMods(NucleicAcid, digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages, + CleavageSpecificityForFdrCategory, variableModPattern, numFixedMods, _fivePrimeTerminus, _threePrimeTerminus); + + variableModificationIsoforms++; + if (variableModificationIsoforms == maximumVariableModificationIsoforms) + { + yield break; + } + } + } + finally + { + DictionaryPool.Return(twoBasedPossibleVariableAndLocalizeableModifications); + FixedModDictionaryPool.Return(fixedModDictionary); + } + } + } +} diff --git a/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs new file mode 100644 index 000000000..6455e209f 
--- /dev/null +++ b/mzLib/Transcriptomics/Digestion/OligoWithSetMods.cs @@ -0,0 +1,442 @@ +using Chemistry; +using MassSpectrometry; +using Omics.Digestion; +using Omics.Fragmentation; +using Omics.Modifications; +using Omics; +using Easy.Common.Extensions; +using Omics.Fragmentation.Oligo; +using System.Text; + +namespace Transcriptomics.Digestion +{ + /// + /// Represents an oligonucleotide with set modifications, providing properties and methods for + /// accessing and manipulating its chemical characteristics. + /// + /// + /// The monoisotopic mass, most abundant mass, and chemical formula are calculated on the fly if the corresponding properties + /// (_monoisotopicMass, _thisChemicalFormula, _mostAbundantMonoisotopicMass) are null. This ensures that the most up-to-date values are + /// always available based on the current state of the oligonucleotide and its modifications. Therefor, it is important to set those + /// properties to null whenever a termini or modification is changed. + /// + public class OligoWithSetMods : NucleolyticOligo, IBioPolymerWithSetMods, INucleicAcid, IEquatable + { + public OligoWithSetMods(NucleicAcid nucleicAcid, RnaDigestionParams digestionParams, int oneBaseStartResidue, + int oneBasedEndResidue, int missedCleavages, CleavageSpecificity cleavageSpecificity, + Dictionary allModsOneIsNTerminus, int numFixedMods, IHasChemicalFormula? fivePrimeTerminus = null, + IHasChemicalFormula? 
threePrimeTerminus = null) + : base(nucleicAcid, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, + cleavageSpecificity, fivePrimeTerminus, threePrimeTerminus) + { + _digestionParams = digestionParams; + _allModsOneIsNterminus = allModsOneIsNTerminus; + NumFixedMods = numFixedMods; + FullSequence = this.DetermineFullSequence(); + } + + public OligoWithSetMods(string sequence, Dictionary allKnownMods, int numFixedMods = 0, + RnaDigestionParams digestionParams = null, NucleicAcid n = null, int oneBaseStartResidue = 1, int oneBasedEndResidue = 0, + int missedCleavages = 0, CleavageSpecificity cleavageSpecificity = CleavageSpecificity.Full, string description = null, + IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null) + : base(n, oneBaseStartResidue, oneBasedEndResidue, missedCleavages, + cleavageSpecificity, fivePrimeTerminus, threePrimeTerminus) + { + if (sequence.Contains("|")) + { + throw new MzLibUtil.MzLibException("Ambiguous oligo cannot be parsed from string: " + sequence); + } + + FullSequence = sequence; + _baseSequence = IBioPolymerWithSetMods.GetBaseSequenceFromFullSequence(sequence); + _allModsOneIsNterminus = GetModsAfterDeserialization(allKnownMods); + NumFixedMods = numFixedMods; + _digestionParams = digestionParams; + Description = description; + + if (n != null) + Parent = n; + } + + private RnaDigestionParams _digestionParams; + private Dictionary _allModsOneIsNterminus; + private double? _monoisotopicMass; + private ChemicalFormula? _thisChemicalFormula; + private double? _mostAbundantMonoisotopicMass; + private IDictionary>? _oneBasedPossibleLocalizedModifications; + private string? 
_sequenceWithChemicalFormula; + + public string FullSequence { get; private set; } + public IDigestionParams DigestionParams => _digestionParams; + public IHasChemicalFormula FivePrimeTerminus + { + get => _fivePrimeTerminus; + set + { + _fivePrimeTerminus = value; + _monoisotopicMass = null; + _thisChemicalFormula = null; + _mostAbundantMonoisotopicMass = null; + } + } + + public IHasChemicalFormula ThreePrimeTerminus + { + get => _threePrimeTerminus; + set + { + _threePrimeTerminus = value; + _monoisotopicMass = null; + _thisChemicalFormula = null; + _mostAbundantMonoisotopicMass = null; + } + } + + public double MonoisotopicMass + { + get + { + _monoisotopicMass ??= BaseSequence.Sum(nuc => Nucleotide.GetResidue(nuc).MonoisotopicMass) + + AllModsOneIsNterminus.Values.Sum(mod => mod.MonoisotopicMass!.Value) + + FivePrimeTerminus.MonoisotopicMass + + ThreePrimeTerminus.MonoisotopicMass; + return _monoisotopicMass.Value; + } + } + + public ChemicalFormula ThisChemicalFormula + { + get + { + if (_thisChemicalFormula is not null) return _thisChemicalFormula!; + + var fullFormula = new RNA(BaseSequence, FivePrimeTerminus, ThreePrimeTerminus).GetChemicalFormula(); + foreach (var mod in AllModsOneIsNterminus.Values) + { + if (mod.ChemicalFormula is null) + { + fullFormula = null; + break; + } + fullFormula.Add(mod.ChemicalFormula); + } + _thisChemicalFormula = fullFormula; + return _thisChemicalFormula!; + } + } + + public double MostAbundantMonoisotopicMass + { + get + { + if (_mostAbundantMonoisotopicMass is not null) return _mostAbundantMonoisotopicMass.Value; + + var distribution = IsotopicDistribution.GetDistribution(ThisChemicalFormula); + double maxIntensity = distribution.Intensities.Max(); + _mostAbundantMonoisotopicMass = distribution.Masses[distribution.Intensities.IndexOf(maxIntensity)].RoundedDouble(); + return _mostAbundantMonoisotopicMass!.Value; + } + } + + public string SequenceWithChemicalFormulas + { + get + { + if (_sequenceWithChemicalFormula is not 
null) return _sequenceWithChemicalFormula; + + var subsequence = new StringBuilder(); + // variable modification on peptide N-terminus + if (AllModsOneIsNterminus.TryGetValue(1, out Modification? pepNTermVariableMod)) + { + if (pepNTermVariableMod is { } mod) + subsequence.Append('[' + mod.ChemicalFormula.Formula + ']'); + } + + for (int r = 0; r < Length; r++) + { + subsequence.Append(this[r]); + // variable modification on this residue + if (!AllModsOneIsNterminus.TryGetValue(r + 2, out Modification? residueVariableMod)) continue; + if (residueVariableMod is { } mod) + subsequence.Append('[' + mod.ChemicalFormula.Formula + ']'); + } + + // variable modification on peptide C-terminus + if (AllModsOneIsNterminus.TryGetValue(Length + 2, out Modification? pepCTermVariableMod)) + { + if (pepCTermVariableMod is { } mod) + subsequence.Append('[' + mod.ChemicalFormula.Formula + ']'); + } + + _sequenceWithChemicalFormula = subsequence.ToString(); + return _sequenceWithChemicalFormula; + } + } + + public Dictionary AllModsOneIsNterminus => _allModsOneIsNterminus; + + public IDictionary> OneBasedPossibleLocalizedModifications => _oneBasedPossibleLocalizedModifications ??= + _allModsOneIsNterminus.ToDictionary(p => p.Key, p => new List() { p.Value }); + public int NumMods => AllModsOneIsNterminus.Count; + public int NumFixedMods { get; } + public int NumVariableMods => NumMods - NumFixedMods; + + /// + /// Generates theoretical fragments for given dissociation type for this peptide. + /// The "products" parameter is filled with these fragments. 
+        ///
+        /// <param name="dissociationType">Dissociation type that selects the 5' and 3' product types to generate.</param>
+        /// <param name="fragmentationTerminus">Terminus (5', 3', both, or none) to generate products for.</param>
+        /// <param name="products">Output list; it is cleared on entry and then filled with the generated products.</param>
+        public void Fragment(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus,
+            List<Product> products)
+        {
+            products.Clear();
+
+            // Both product-type lists are resolved up front regardless of the requested terminus.
+            var fivePrimeProductTypes =
+                dissociationType.GetRnaTerminusSpecificProductTypesFromDissociation(FragmentationTerminus.FivePrime);
+            var threePrimeProductTypes =
+                dissociationType.GetRnaTerminusSpecificProductTypesFromDissociation(FragmentationTerminus.ThreePrime);
+
+            bool calculateFivePrime =
+                fragmentationTerminus is FragmentationTerminus.FivePrime or FragmentationTerminus.Both;
+            bool calculateThreePrime =
+                fragmentationTerminus is FragmentationTerminus.ThreePrime or FragmentationTerminus.Both;
+
+            // Slice the parent's nucleotide array once and share it across every GetNeutralFragments call.
+            var sequence = (Parent as NucleicAcid)!.NucleicAcidArray[(OneBasedStartResidue - 1)..OneBasedEndResidue];
+
+            // intact product ion
+            if (fragmentationTerminus is FragmentationTerminus.Both or FragmentationTerminus.None)
+            {
+                products.AddRange(GetNeutralFragments(ProductType.M, sequence));
+            }
+
+            if (calculateFivePrime)
+            {
+                foreach (var type in fivePrimeProductTypes)
+                {
+                    products.AddRange(GetNeutralFragments(type, sequence));
+                }
+            }
+
+            if (calculateThreePrime)
+            {
+                foreach (var type in threePrimeProductTypes)
+                {
+                    products.AddRange(GetNeutralFragments(type, sequence));
+                }
+            }
+        }
+
+        #region IEquatable
+
+        /// <summary>
+        /// Oligos are equal if they have the same full sequence, parent, and digestion agent, and terminal caps
+        /// </summary>
+        public override bool Equals(object? obj) => obj is OligoWithSetMods oligo && Equals(oligo);
+
+        /// <summary>
+        /// Oligos are equal if they have the same full sequence, parent, and digestion agent, and terminal caps
+        /// </summary>
+        public bool Equals(IBioPolymerWithSetMods? other) => Equals(other as OligoWithSetMods);
+
+        /// <summary>
+        /// Oligos are equal if they have the same full sequence, parent, and digestion agent, and terminal caps.
+        /// When neither oligo has a parent, only the full sequence is compared.
+        /// </summary>
+        public bool Equals(OligoWithSetMods? other)
+        {
+            if (other is null) return false;
+            if (ReferenceEquals(this, other)) return true;
+            if (other.GetType() != GetType()) return false;
+
+            // for those constructed from sequence and mods only
+            if (Parent is null && other.Parent is null)
+                return FullSequence.Equals(other.FullSequence);
+
+            // The static object.Equals(a, b) is used for the termini so a missing terminus compares
+            // as unequal instead of throwing a NullReferenceException.
+            return FullSequence == other.FullSequence
+                   && Equals(DigestionParams?.DigestionAgent, other.DigestionParams?.DigestionAgent)
+                   && Equals(_fivePrimeTerminus, other._fivePrimeTerminus)
+                   && Equals(_threePrimeTerminus, other._threePrimeTerminus)
+                   // These last two are important for parsimony in MetaMorpheus
+                   && OneBasedStartResidue == other.OneBasedStartResidue
+                   && Equals(Parent?.Accession, other.Parent?.Accession);
+        }
+
+        /// <summary>
+        /// Hash code matching the fields compared by <see cref="Equals(OligoWithSetMods)"/>.
+        /// </summary>
+        public override int GetHashCode()
+        {
+            var hash = new HashCode();
+            hash.Add(FullSequence);
+
+            // Equals compares only FullSequence when neither oligo has a parent, so a parentless oligo
+            // must hash on FullSequence alone; otherwise two equal oligos could land in different buckets.
+            if (Parent is null)
+            {
+                return hash.ToHashCode();
+            }
+
+            hash.Add(OneBasedStartResidue);
+            if (Parent.Accession != null)
+            {
+                hash.Add(Parent.Accession);
+            }
+            if (DigestionParams?.DigestionAgent != null)
+            {
+                hash.Add(DigestionParams.DigestionAgent);
+            }
+            hash.Add(FivePrimeTerminus);
+            hash.Add(ThreePrimeTerminus);
+            return hash.ToHashCode();
+        }
+
+        #endregion
+
+        /// <summary>
+        /// Generates theoretical internal fragments for given dissociation type for this oligo.
+        /// The "products" parameter is filled with these fragments.
+        /// The "minLengthOfFragments" parameter is the minimum number of nucleic acids for an internal fragment to be included
+        /// </summary>
+        /// <exception cref="NotImplementedException">Always; internal fragmentation is not implemented here.</exception>
+        public void FragmentInternally(DissociationType dissociationType, int minLengthOfFragments,
+            List<Product> products)
+        {
+            throw new NotImplementedException();
+        }
+
+        /// <summary>
+        /// Calculates all the fragments of the types you specify
+        /// </summary>
+        /// <param name="type">product type to get neutral fragments from</param>
+        /// <param name="sequence">Sequence to generate fragments from, will be calculated from the parent if left null</param>
+        /// <returns>
+        /// For <see cref="ProductType.M"/>, a single product carrying the intact monoisotopic mass; otherwise one
+        /// product per fragment number from 1 to BaseSequence.Length - 1, built outward from the relevant terminus.
+        /// </returns>
+        public IEnumerable<Product> GetNeutralFragments(ProductType type, Nucleotide[]? sequence = null)
+        {
+            sequence ??= (Parent as NucleicAcid)!.NucleicAcidArray[(OneBasedStartResidue - 1)..OneBasedEndResidue];
+
+            if (type is ProductType.M)
+            {
+                yield return new Product(type, FragmentationTerminus.None, MonoisotopicMass, 0, 0, 0);
+                yield break;
+            }
+
+            bool isThreePrimeTerminal = type.GetRnaTerminusType() == FragmentationTerminus.ThreePrime;
+            FragmentationTerminus fragmentTerminus =
+                isThreePrimeTerminal ? FragmentationTerminus.ThreePrime : FragmentationTerminus.FivePrime;
+            IHasChemicalFormula terminus = isThreePrimeTerminal ? ThreePrimeTerminus : FivePrimeTerminus;
+
+            // Loop-invariant: product types whose name contains "Base" lose the base of the last nucleotide added.
+            bool losesBase = type.ToString().Contains("Base");
+
+            // Running mass starts with the product-type mass shift plus the terminal cap.
+            double monoMass = type.GetRnaMassShiftFromProductType() + terminus.MonoisotopicMass;
+
+            // Fragment numbers start at 1; the previous implementation only skipped i == 0 via a flag.
+            for (int i = 1; i <= BaseSequence.Length - 1; i++)
+            {
+                // 3' fragments walk backwards from the last residue, 5' fragments forwards from the first.
+                int naIndex = isThreePrimeTerminal ? Length - i : i - 1;
+                Nucleotide nucleotide = sequence[naIndex];
+                monoMass += nucleotide.MonoisotopicMass;
+
+                // Modification keys are offset by 2 from the zero-based residue index.
+                if (AllModsOneIsNterminus.TryGetValue(naIndex + 2, out Modification? mod))
+                {
+                    monoMass += mod.MonoisotopicMass ?? 0;
+                }
+
+                double neutralLoss = losesBase ? nucleotide.BaseChemicalFormula.MonoisotopicMass : 0;
+
+                yield return new Product(type, fragmentTerminus, monoMass - neutralLoss, i,
+                    isThreePrimeTerminal ? BaseSequence.Length - i : i, 0, null, 0);
+            }
+        }
+
+        /// <summary>
+        /// Outputs a duplicate IBioPolymerWithSetMods with a localized mass shift, replacing a modification when present
+        /// </summary>
+        /// <remarks>
+        /// Used to localize an unknown mass shift in the MetaMorpheus Localization Engine
+        /// </remarks>
+        /// <param name="indexOfMass">The key of the modification in the AllModsOneIsNterminus dictionary, minus 2 (this method adds 2 before the lookup)</param>
+        /// <param name="massToLocalize">The mass to add to the BioPolymer</param>
+        /// <returns>A new oligo whose modification at that position carries the localized mass plus the mass of any modification it replaced.</returns>
+        public IBioPolymerWithSetMods Localize(int indexOfMass, double massToLocalize)
+        {
+            int modKey = indexOfMass + 2;
+            var dictWithLocalizedMass = new Dictionary<int, Modification>(AllModsOneIsNterminus);
+
+            // A replaced modification with no mass contributes 0 (a plain cast would throw on null),
+            // matching the "?? 0" handling in GetNeutralFragments.
+            double massOfExistingMod = dictWithLocalizedMass.TryGetValue(modKey, out Modification? modToReplace)
+                ? modToReplace.MonoisotopicMass ?? 0
+                : 0;
+
+            // Indexer assignment overwrites the existing entry or inserts a new one.
+            dictWithLocalizedMass[modKey] = new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod);
+
+            return new OligoWithSetMods(NucleicAcid, _digestionParams, OneBasedStartResidue, OneBasedEndResidue, MissedCleavages,
+                CleavageSpecificityForFdrCategory, dictWithLocalizedMass, NumFixedMods, FivePrimeTerminus, ThreePrimeTerminus);
+        }
+
+        /// <summary>
+        /// Rebuilds the position-to-modification dictionary by scanning the bracketed annotations in FullSequence.
+        /// </summary>
+        /// <param name="idToMod">Known modifications keyed by the id that follows the ':' inside each bracket.</param>
+        /// <returns>Modifications keyed by location; the key starts at 1 and advances by one per character outside brackets.</returns>
+        /// <exception cref="MzLibUtil.MzLibException">
+        /// Thrown when a bracketed annotation cannot be split on ':' or its id is not present in <paramref name="idToMod"/>.
+        /// </exception>
+        private Dictionary<int, Modification> GetModsAfterDeserialization(Dictionary<string, Modification> idToMod)
+        {
+            var mods = new Dictionary<int, Modification>();
+            int currentModStart = 0;
+            int currentModificationLocation = 1;
+            bool currentlyReadingMod = false;
+            int bracketCount = 0;
+
+            for (int r = 0; r < FullSequence.Length; r++)
+            {
+                char c = FullSequence[r];
+                if (c == '[')
+                {
+                    currentlyReadingMod = true;
+                    if (bracketCount == 0)
+                    {
+                        // only the outermost '[' marks the start of the annotation text
+                        currentModStart = r + 1;
+                    }
+
+                    bracketCount++;
+                }
+                else if (c == ']')
+                {
+                    string modId = null;
+                    bracketCount--;
+                    if (bracketCount == 0)
+                    {
+                        try
+                        {
+                            //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot")
+                            string modString = FullSequence.Substring(currentModStart, r - currentModStart);
+                            int splitIndex = modString.IndexOf(':');
+                            // The prefix itself is unused, but this call is what rejects annotations without
+                            // a ':' (splitIndex == -1 makes Substring throw), so it must stay.
+                            _ = modString.Substring(0, splitIndex);
+                            modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1);
+                        }
+                        catch (Exception e)
+                        {
+                            throw new MzLibUtil.MzLibException(
+                                "Error while trying to parse string into peptide: " + e.Message, e);
+                        }
+
+                        if (!idToMod.TryGetValue(modId, out Modification mod))
+                        {
+                            throw new MzLibUtil.MzLibException(
+                                "Could not find modification while reading string: " + FullSequence);
+                        }
+
+                        // A 3'-terminal modification that closes the string is stored at BaseSequence.Length + 2.
+                        if (mod.LocationRestriction.Contains("3'-terminal.") && r == FullSequence.Length - 1)
+                        {
+                            currentModificationLocation = BaseSequence.Length + 2;
+                        }
+
+                        mods.Add(currentModificationLocation, mod);
+                        currentlyReadingMod = false;
+                    }
+                }
+                else if (!currentlyReadingMod)
+                {
+                    currentModificationLocation++;
+                }
+                //else do nothing
+            }
+
+            return mods;
+        }
+    }
+}
diff --git a/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs
new file mode 100644
index 000000000..fb80a1a0b
--- /dev/null
+++ b/mzLib/Transcriptomics/Digestion/RnaDigestionParams.cs
@@ -0,0 +1,45 @@
+using Omics.Digestion;
+using Omics.Fragmentation;
+
+namespace Transcriptomics.Digestion
+{
+    public class RnaDigestionParams : IDigestionParams
+    {
+
+        // this parameterless constructor needs to exist to read the toml.
+        public RnaDigestionParams() : this("top-down")
+        {
+        }
+
+        /// <summary>
+        /// Creates digestion parameters for the named RNase.
+        /// </summary>
+        /// <param name="rnase">Key looked up in RnaseDictionary.Dictionary; an unknown key makes the lookup throw.</param>
+        /// <param name="maxMissedCleavages">Stored in <see cref="MaxMissedCleavages"/>.</param>
+        /// <param name="minLength">Stored in <see cref="MinLength"/>.</param>
+        /// <param name="maxLength">Stored in <see cref="MaxLength"/>.</param>
+        /// <param name="maxModificationIsoforms">Stored in <see cref="MaxModificationIsoforms"/>.</param>
+        /// <param name="maxMods">Stored in <see cref="MaxMods"/>.</param>
+        /// <param name="fragmentationTerminus">Stored in <see cref="FragmentationTerminus"/>.</param>
+        public RnaDigestionParams(string rnase = "top-down", int maxMissedCleavages = 0, int minLength = 3,
+            int maxLength = int.MaxValue, int maxModificationIsoforms = 1024, int maxMods = 2,
+            FragmentationTerminus fragmentationTerminus = FragmentationTerminus.Both)
+        {
+            Rnase = RnaseDictionary.Dictionary[rnase];
+            MaxMissedCleavages = maxMissedCleavages;
+            MinLength = minLength;
+            MaxLength = maxLength;
+            MaxMods = maxMods;
+            MaxModificationIsoforms = maxModificationIsoforms;
+            FragmentationTerminus = fragmentationTerminus;
+        }
+
+        public int MaxMissedCleavages { get; set; }
+        public int MinLength { get; set; }
+        public int MaxLength { get; set; }
+        public int MaxModificationIsoforms { get; set; }
+        public int MaxMods { get; set; }
+        public DigestionAgent DigestionAgent => Rnase;
+        public Rnase Rnase { get; private set; }
+        public FragmentationTerminus FragmentationTerminus { get; set; }
+        public CleavageSpecificity SearchModeType { get; set; } = CleavageSpecificity.Full;
+
+        /// <summary>
+        /// Creates a copy of these parameters, optionally with a different fragmentation terminus.
+        /// </summary>
+        /// <param name="newTerminus">Terminus for the copy; when null the current <see cref="FragmentationTerminus"/> is kept.</param>
+        /// <returns>A new <see cref="RnaDigestionParams"/> carrying the same settings.</returns>
+        public IDigestionParams Clone(FragmentationTerminus? newTerminus = null)
+        {
+            return new RnaDigestionParams(Rnase.Name, MaxMissedCleavages, MinLength, MaxLength,
+                MaxModificationIsoforms, MaxMods, newTerminus ?? FragmentationTerminus)
+            {
+                // SearchModeType is not a constructor argument, so it has to be copied explicitly;
+                // otherwise the clone silently falls back to CleavageSpecificity.Full.
+                SearchModeType = SearchModeType
+            };
+        }
+    }
+}
diff --git a/mzLib/Transcriptomics/Digestion/Rnase.cs b/mzLib/Transcriptomics/Digestion/Rnase.cs
index 646bbc8d1..79beb5821 100644
--- a/mzLib/Transcriptomics/Digestion/Rnase.cs
+++ b/mzLib/Transcriptomics/Digestion/Rnase.cs
@@ -1,4 +1,5 @@
-using Omics.Digestion;
+using Chemistry;
+using Omics.Digestion;
 using Omics.Modifications;
 
 namespace Transcriptomics.Digestion
@@ -8,15 +9,63 @@ public class Rnase : DigestionAgent, IEquatable
         public Rnase(string name, CleavageSpecificity cleaveSpecificity, List motifList, Modification cleavageMod = null) :
             base(name, cleaveSpecificity, motifList, cleavageMod)
         {
-            Name = name;
             CleavageSpecificity = cleaveSpecificity;
             DigestionMotifs = motifList;
         }
 
-        // TODO: Coming soon to a mzLib near you
-        // public List GetUnmodifiedOligos(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, int maxLength)
-        // private IEnumerable FullDigestion(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength, int maxLength)
-
+        /// <summary>
+        /// Digests a nucleic acid into unmodified oligos according to this RNase's cleavage specificity.
+        /// </summary>
+        /// <param name="nucleicAcid">Nucleic acid to digest.</param>
+        /// <param name="maxMissedCleavages">Maximum number of missed cleavages allowed per oligo (full digestion only).</param>
+        /// <param name="minLength">Minimum oligo length passed to ValidLength.</param>
+        /// <param name="maxLength">Maximum oligo length passed to ValidLength.</param>
+        /// <returns>The oligos that pass the length filter; may be empty.</returns>
+        /// <exception cref="ArgumentException">Thrown when the cleavage specificity is neither Full nor None.</exception>
+        public List<NucleolyticOligo> GetUnmodifiedOligos(NucleicAcid nucleicAcid, int maxMissedCleavages, int minLength,
+            int maxLength)
+        {
+            var oligos = new List<NucleolyticOligo>();
+
+            // top down
+            if (CleavageSpecificity == CleavageSpecificity.None)
+            {
+                // the whole molecule is the only product
+                if (ValidLength(nucleicAcid.Length, minLength, maxLength))
+                    oligos.Add(new NucleolyticOligo(nucleicAcid, 1, nucleicAcid.Length,
+                        0, CleavageSpecificity.Full, nucleicAcid.FivePrimeTerminus, nucleicAcid.ThreePrimeTerminus));
+            }
+            // full cleavage
+            else if (CleavageSpecificity == CleavageSpecificity.Full)
+            {
+                oligos.AddRange(FullDigestion(nucleicAcid, maxMissedCleavages, minLength, maxLength));
+            }
+            else
+            {
+                throw new ArgumentException(
+                    "Cleave Specificity not defined for Rna digestion, currently supports Full and None");
+            }
+
+            return oligos;
+        }
+
+        /// <summary>
+        /// Lazily yields every oligo produced by cleaving after each digestion site, for 0 up to
+        /// <paramref name="maxMissedCleavages"/> missed cleavages, keeping only those that pass ValidLength.
+        /// </summary>
+        private IEnumerable<NucleolyticOligo> FullDigestion(NucleicAcid nucleicAcid, int maxMissedCleavages,
+            int minLength, int maxLength)
+        {
+            List<int> oneBasedIndicesToCleaveAfter = GetDigestionSiteIndices(nucleicAcid.BaseSequence);
+            for (int missedCleavages = 0; missedCleavages <= maxMissedCleavages; missedCleavages++)
+            {
+                for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - missedCleavages - 1; i++)
+                {
+                    // The oligo spans from just after site i through the site (missedCleavages + 1) further on.
+                    int previousCleavageSite = oneBasedIndicesToCleaveAfter[i];
+                    int oneBasedEndResidue = oneBasedIndicesToCleaveAfter[i + missedCleavages + 1];
+                    if (!ValidLength(oneBasedEndResidue - previousCleavageSite, minLength, maxLength))
+                        continue;
+
+                    int oneBasedStartResidue = previousCleavageSite + 1;
+
+                    // contains original 5' terminus ? keep it : set to OH
+                    // (a fresh formula is parsed per oligo so oligos never share a terminus instance)
+                    IHasChemicalFormula fivePrimeTerminus = oneBasedStartResidue == 1 ? nucleicAcid.FivePrimeTerminus : ChemicalFormula.ParseFormula("O-3P-1");
+
+                    // contains original 3' terminus ? keep it : set to phosphate
+                    IHasChemicalFormula threePrimeTerminus = oneBasedEndResidue == nucleicAcid.Length ? nucleicAcid.ThreePrimeTerminus : ChemicalFormula.ParseFormula("H2O4P");
+
+                    yield return new NucleolyticOligo(nucleicAcid, oneBasedStartResidue, oneBasedEndResidue,
+                        missedCleavages, CleavageSpecificity.Full, fivePrimeTerminus, threePrimeTerminus);
+                }
+            }
+        }
+
         public bool Equals(Rnase?
other) { if (ReferenceEquals(null, other)) return false; diff --git a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs index 3d55d2ef4..4e3e95e4d 100644 --- a/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs +++ b/mzLib/Transcriptomics/Interfaces/INucleicAcid.cs @@ -1,10 +1,9 @@ using Chemistry; -using Omics; using Omics.Modifications; namespace Transcriptomics { - public interface INucleicAcid : IHasChemicalFormula, IBioPolymer + public interface INucleicAcid : IHasChemicalFormula { /// /// The amino acid sequence diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs new file mode 100644 index 000000000..fad764816 --- /dev/null +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -0,0 +1,349 @@ +using Chemistry; +using Omics.Digestion; +using Omics.Modifications; +using Omics; +using System.Text; +using MzLibUtil; +using Transcriptomics.Digestion; + +namespace Transcriptomics +{ + /// + /// A linear polymer of Nucleic acids + /// + public abstract class NucleicAcid : INucleicAcid, IBioPolymer, IEquatable + { + #region Static Properties + + /// + /// The default chemical formula of the five prime (hydroxyl group) + /// + /// + /// This means that the five prime cap will remove the excess components of first nucleotides + /// phospho group, leaving only the hydroxyl. This formula will be used for the five prime cap, unless + /// the nucleic acid is constructed with a different chemical formula + /// + public static readonly ChemicalFormula DefaultFivePrimeTerminus = ChemicalFormula.ParseFormula("O-3P-1"); + + /// + /// The default chemical formula of the three prime terminus (hydroxyl group) + /// + /// + /// This is used to account for the mass of the additional hydroxyl group at the three end of most oligonucleotides. 
+ /// This formula will be used for the three prime cap, unless the nucleic acid is constructed with a different + /// chemical formula + /// + public static readonly ChemicalFormula DefaultThreePrimeTerminus = ChemicalFormula.ParseFormula("OH"); + + #endregion + + #region Constuctors + + /// + /// For creating an RNA programatically + /// + protected NucleicAcid(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null) + { + MonoisotopicMass = 0; + _nucleicAcids = new Nucleotide[sequence.Length]; + ThreePrimeTerminus = threePrimeTerm ??= DefaultThreePrimeTerminus; + FivePrimeTerminus = fivePrimeTerm ??= DefaultFivePrimeTerminus; + _oneBasedPossibleLocalizedModifications = oneBasedPossibleLocalizedModifications ?? new Dictionary>(); + GeneNames = new List>(); + + ParseSequenceString(sequence); + } + + /// + /// For Reading in from rna database + /// + protected NucleicAcid(string sequence, string name, string identifier, string organism, string databaseFilePath, + IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null, + bool isContaminant = false, bool isDecoy = false, List>? geneNames = null, + Dictionary? additionalDatabaseFields = null) + : this(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) + { + Name = name; + DatabaseFilePath = databaseFilePath; + IsDecoy = isDecoy; + IsContaminant = isContaminant; + Organism = organism; + Accession = identifier; + AdditionalDatabaseFields = additionalDatabaseFields; + GeneNames = geneNames ?? 
new List>(); + } + + #endregion + + #region Private Properties + + /// + /// The 5-Prime chemical formula cap + /// + private IHasChemicalFormula _5PrimeTerminus; + + /// + /// The 3-Prime chemical formula cap + /// + private IHasChemicalFormula _3PrimeTerminus; + + /// + /// All of the nucleic acid residues indexed by position from 5- to 3-prime. + /// + private Nucleotide[] _nucleicAcids; + + /// + /// The nucleic acid sequence. Is ignored if 'StoreSequenceString' is false + /// + private string _sequence; + + private IDictionary> _oneBasedPossibleLocalizedModifications; + + #endregion + + #region Public Properties + + /// + /// Gets or sets the 5' terminus of this nucleic acid polymer + /// + public IHasChemicalFormula FivePrimeTerminus + { + get => _5PrimeTerminus; + set => ReplaceTerminus(ref _5PrimeTerminus, value); + } + + /// + /// Gets or sets the 3' terminus of this nucleic acid polymer + /// + public IHasChemicalFormula ThreePrimeTerminus + { + get => _3PrimeTerminus; + set => ReplaceTerminus(ref _3PrimeTerminus, value); + } + + /// + /// Gets the number of nucleic acids in this nucleic acid polymer + /// + public int Length => BaseSequence.Length; + + public string Name { get; } + public string FullName => Name; // TODO: Consider if this needs to be different from the name + public string DatabaseFilePath { get; } + public bool IsDecoy { get; } + public bool IsContaminant { get; } + public string Accession { get; } + + public IDictionary> OneBasedPossibleLocalizedModifications => _oneBasedPossibleLocalizedModifications; + public string Organism { get; } + + /// + /// The list of gene names consists of tuples, where Item1 is the type of gene name, and Item2 is the name. There may be many genes and names of a certain type produced when reading an XML protein database. + /// + public IEnumerable> GeneNames { get; } + public Dictionary? 
AdditionalDatabaseFields { get; } + + /// + /// The total monoisotopic mass of this peptide and all of its modifications + /// + public double MonoisotopicMass { get; private set; } + + /// + /// Returns a copy of the nucleic acid array, used for -base mass calculations. + /// + public Nucleotide[] NucleicAcidArray => _nucleicAcids; + + public ChemicalFormula ThisChemicalFormula => GetChemicalFormula(); + + #endregion + + #region Nucleic Acid Sequence + + /// + /// Gets the base nucleic acid sequence + /// + public string BaseSequence + { + get + { + // Generate the sequence if the stored version is null or empty + if (string.IsNullOrEmpty(_sequence)) + { + _sequence = new string(_nucleicAcids.Select(na => na.Letter).ToArray()); + } + + return _sequence; + } + } + + public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex]; + + #endregion + + #region Digestion + + public IEnumerable Digest(IDigestionParams digestionParameters, List allKnownFixedMods, + List variableModifications, List silacLabels = null, (SilacLabel startLabel, SilacLabel endLabel)? 
turnoverLabels = null, + bool topDownTruncationSearch = false) + { + if (digestionParameters is not RnaDigestionParams digestionParams) + throw new MzLibException( + "DigestionParameters must be of type DigestionParams for protein digestion", new ArgumentException()); + allKnownFixedMods ??= new(); + variableModifications ??= new(); + + // digest based upon base sequence + foreach (var unmodifiedOligo in digestionParams.Rnase.GetUnmodifiedOligos(this, + digestionParams.MaxMissedCleavages, digestionParams.MinLength, digestionParams.MaxLength)) + { + // add fixed and variable mods to base sequence digestion products + foreach (var modifiedOligo in unmodifiedOligo.GenerateModifiedOligos(allKnownFixedMods, digestionParams, + variableModifications)) + { + yield return modifiedOligo; + } + } + } + + public IEnumerable Digest(RnaDigestionParams digestionParameters, + List allKnownFixedMods, + List variableModifications, List silacLabels = null, + (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels = null, + bool topDownTruncationSearch = false) + { + return Digest((IDigestionParams)digestionParameters, allKnownFixedMods, variableModifications, silacLabels, turnoverLabels, topDownTruncationSearch) + .Cast(); + } + + #endregion + + #region Electrospray + + public IEnumerable GetElectrospraySeries(int minCharge, int maxCharge) + { + if (minCharge > maxCharge) + (minCharge, maxCharge) = (maxCharge, minCharge); + + for (int i = maxCharge; i > minCharge - 1; i--) + yield return this.ToMz(i); + } + + #endregion + + #region Chemical Formula + + public ChemicalFormula GetChemicalFormula() + { + var formula = new ChemicalFormula(); + + // Handle 5'-Terminus + formula.Add(FivePrimeTerminus.ThisChemicalFormula); + + // Handle 3'-Terminus + formula.Add(ThreePrimeTerminus.ThisChemicalFormula); + + // Handle Nucleic Acid Residues + for (int i = 0; i < Length; i++) + { + formula.Add(_nucleicAcids[i].ThisChemicalFormula); + } + + return formula; + } + + #endregion + + #region 
Private Methods + + private void ReplaceTerminus(ref IHasChemicalFormula? terminus, IHasChemicalFormula? value) + { + if (Equals(value, terminus)) + return; + + if (terminus != null) + MonoisotopicMass -= terminus.MonoisotopicMass; + + terminus = value; + + if (value != null) + MonoisotopicMass += value.MonoisotopicMass; + } + + /// + /// Parses a string sequence of nucleic acid characters into an array of Nucleotide objects, + /// updates the sequence string, and calculates the monoisotopic mass. + /// + /// The string sequence of nucleic acid characters to parse. + private void ParseSequenceString(string sequence) + { + if (string.IsNullOrEmpty(sequence)) + return; + + int index = 0; + double monoMass = 0; + + StringBuilder sb = null; + sb = new StringBuilder(sequence.Length); + + foreach (char letter in sequence) + { + Nucleotide residue; + if (Nucleotide.TryGetResidue(letter, out residue)) + { + _nucleicAcids[index++] = residue; + sb.Append(residue.Letter); + monoMass += residue.MonoisotopicMass; + } + else + { + switch (letter) + { + case ' ': // ignore spaces + break; + + case '*': // ignore * + break; + + default: + throw new ArgumentException(string.Format( + "Nucleic Acid Letter {0} does not exist in the Nucleic Acid Dictionary. {0} is also not a valid character", + letter)); + } + } + } + + _sequence = sb.ToString(); + MonoisotopicMass += monoMass; + Array.Resize(ref _nucleicAcids, Length); + } + + #endregion + + #region Interface Implemntations and Overrides + + public bool Equals(NucleicAcid? other) + { + // interface equals first because it does null and reference checks + return (this as IBioPolymer).Equals(other) + && _5PrimeTerminus.Equals(other._5PrimeTerminus) + && _3PrimeTerminus.Equals(other._3PrimeTerminus); + } + + public override bool Equals(object? 
obj) + { + if (obj is NucleicAcid oligo) + { + return Equals(oligo); + } + return false; + } + + public override int GetHashCode() + { + return HashCode.Combine(_5PrimeTerminus, _3PrimeTerminus, _sequence); + } + + #endregion + } +} diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs new file mode 100644 index 000000000..a7a5a4803 --- /dev/null +++ b/mzLib/Transcriptomics/RNA.cs @@ -0,0 +1,47 @@ +using Chemistry; +using Omics.Modifications; + +namespace Transcriptomics +{ + public class RNA : NucleicAcid + { + /// + /// For constructing RNA from a string + /// + /// + /// + /// + /// + public RNA(string sequence, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null, + IDictionary>? oneBasedPossibleLocalizedModifications = null) + : base(sequence, fivePrimeTerm, threePrimeTerm, oneBasedPossibleLocalizedModifications) + { + } + + /// + /// For use with RNA loaded from a database + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + public RNA(string sequence, string name, string accession, string organism, string databaseFilePath, + IHasChemicalFormula? fivePrimeTerminus = null, IHasChemicalFormula? threePrimeTerminus = null, + IDictionary>? oneBasedPossibleModifications = null, + bool isContaminant = false, bool isDecoy = false, List> geneNames = null, + Dictionary? 
databaseAdditionalFields = null) + : base(sequence, name, accession, organism, databaseFilePath, fivePrimeTerminus, threePrimeTerminus, + oneBasedPossibleModifications, isContaminant, isDecoy, geneNames, databaseAdditionalFields) + { + + } + } +} diff --git a/mzLib/UsefulProteomicsDatabases/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyProteinGenerator.cs index 67a442782..07a90b6f0 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyProteinGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyProteinGenerator.cs @@ -180,6 +180,7 @@ private static List GenerateReverseDecoys(List proteins, int m lock (decoyProteins) { decoyProteins.Add(decoyProtein); } }); + decoyProteins = decoyProteins.OrderBy(p => p.Accession).ToList(); return decoyProteins; } @@ -359,6 +360,7 @@ private static List GenerateSlideDecoys(List proteins, int max protein.Name, protein.FullName, true, protein.IsContaminant, null, decoyVariationsSlide, null, protein.SampleNameForVariants, decoy_disulfides_slide, spliceSitesSlide, protein.DatabaseFilePath); lock (decoyProteins) { decoyProteins.Add(decoyProteinSlide); } }); + decoyProteins = decoyProteins.OrderBy(p => p.Accession).ToList(); return decoyProteins; } diff --git a/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs b/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs index b70e3dc23..51978b2db 100644 --- a/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs +++ b/mzLib/UsefulProteomicsDatabases/FastaHeaderFieldRegex.cs @@ -19,5 +19,17 @@ public FastaHeaderFieldRegex(string fieldName, string regularExpression, int mat public int Match { get; } public int Group { get; } + + public string ApplyRegex(string input) + { + string? 
result = null; + var matches = Regex.Matches(input); + if (matches.Count > Match && matches[Match].Groups.Count > Group) + { + result = matches[Match].Groups[Group].Value; + } + + return result!; + } } } \ No newline at end of file diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 8544c2233..b5a680a5e 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -402,7 +402,7 @@ public static IEnumerable MergeProteins(IEnumerable mergeThese } } - private static string ApplyRegex(FastaHeaderFieldRegex regex, string line) + internal static string ApplyRegex(FastaHeaderFieldRegex regex, string line) { string result = null; if (regex != null) @@ -416,7 +416,7 @@ private static string ApplyRegex(FastaHeaderFieldRegex regex, string line) return result; } - private static Dictionary> GetModificationDict(IEnumerable mods) + internal static Dictionary> GetModificationDict(IEnumerable mods) { var mod_dict = new Dictionary>(); @@ -436,7 +436,7 @@ private static Dictionary> GetModificationDict(IEnum return mod_dict; } - private static Dictionary GetModificationDictWithMotifs(IEnumerable mods) + internal static Dictionary GetModificationDictWithMotifs(IEnumerable mods) { var mod_dict = new Dictionary(); diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 155945558..049086a9d 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -5,12 +5,287 @@ using System.IO; using System.Linq; using System.Xml; +using Easy.Common.Extensions; +using Omics; using Omics.Modifications; +using Transcriptomics; namespace UsefulProteomicsDatabases { + + /// + /// Provides methods for writing protein and nucleic acid databases to XML and FASTA formats. 
+ /// Did not rename to DbWriter to ensure compatibility with the original UsefulProteomicsDatabases namespace. + /// public class ProteinDbWriter { + /// + /// Writes an XML database for a list of RNA sequences, including additional modifications. + /// + /// A dictionary of additional modifications to add to proteins. + /// A list of RNA sequences to be written to the database. + /// The name of the output XML file. + /// A dictionary of new modification residue entries. + public static Dictionary WriteXmlDatabase( + Dictionary>> additionalModsToAddToProteins, + List bioPolymerList, string outputFileName) => WriteNucleicAcidXmlDatabase(additionalModsToAddToProteins, bioPolymerList.Cast().ToList(), outputFileName); + + /// + /// Writes an XML database for a list of nucleic acid sequences, including additional modifications. + /// + /// A dictionary of additional modifications to add to proteins. + /// A list of nucleic acid sequences to be written to the database. + /// The name of the output XML file. + /// A dictionary of new modification residue entries. + /// + /// Several chunks of code are commented out. These are blocks that are intended to be implmented in the future, but + /// are not necessary for the bare bones implementation of Transcriptomics + /// + private static Dictionary WriteNucleicAcidXmlDatabase( + Dictionary>> additionalModsToAddToProteins, + List nucleicAcidList, string outputFileName) + { + additionalModsToAddToProteins = additionalModsToAddToProteins ?? 
new Dictionary>>(); + var xmlWriterSettings = new XmlWriterSettings + { + Indent = true, + IndentChars = " " + }; + + Dictionary newModResEntries = new Dictionary(); + using (XmlWriter writer = XmlWriter.Create(outputFileName, xmlWriterSettings)) + { + writer.WriteStartDocument(); + writer.WriteStartElement("mzLibProteinDb"); + + List myModificationList = new List(); + foreach (var p in nucleicAcidList) + { + foreach (KeyValuePair> entry in p.OneBasedPossibleLocalizedModifications) + { + myModificationList.AddRange(entry.Value); + } + } + + // get modifications from nucleic acid list and concatenate the modifications discovered in GPTMDictionary + var allRelevantModifications = + new HashSet(nucleicAcidList + .SelectMany(p => p.OneBasedPossibleLocalizedModifications.SelectMany(m => m.Value)) + .Concat(additionalModsToAddToProteins + .Where(n => nucleicAcidList.Select(nu => nu.Accession).Contains(n.Key)) + .SelectMany(kv => kv.Value.Select(v => v.Item2)))); + + foreach (Modification mod in allRelevantModifications.OrderBy(m => m.IdWithMotif)) + { + writer.WriteStartElement("modification"); + writer.WriteString(mod.ToString() + Environment.NewLine + "//"); + writer.WriteEndElement(); + } + + foreach (var nucleicAcid in nucleicAcidList) + { + writer.WriteStartElement("entry"); + writer.WriteStartElement("accession"); + writer.WriteString(nucleicAcid.Accession); + writer.WriteEndElement(); + + if (nucleicAcid.Name.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("name"); + writer.WriteString(nucleicAcid.Name); + writer.WriteEndElement(); + } + + if (nucleicAcid.FullName.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("protein"); + writer.WriteStartElement("recommendedName"); + writer.WriteStartElement("fullName"); + writer.WriteString(nucleicAcid.FullName); + writer.WriteEndElement(); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + + writer.WriteStartElement("gene"); + foreach (var geneName in nucleicAcid.GeneNames) + { + 
writer.WriteStartElement("name"); + writer.WriteAttributeString("type", geneName.Item1); + writer.WriteString(geneName.Item2); + writer.WriteEndElement(); + } + writer.WriteEndElement(); + + if (nucleicAcid.Organism.IsNotNullOrEmptyOrWhiteSpace()) + { + writer.WriteStartElement("organism"); + writer.WriteStartElement("name"); + writer.WriteAttributeString("type", "scientific"); + writer.WriteString(nucleicAcid.Organism); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + + //foreach (var dbRef in nucleicAcid) + //{ + // writer.WriteStartElement("dbReference"); + // writer.WriteAttributeString("type", dbRef.Type); + // writer.WriteAttributeString("id", dbRef.Id); + // foreach (Tuple property in dbRef.Properties) + // { + // writer.WriteStartElement("property"); + // writer.WriteAttributeString("type", property.Item1); + // writer.WriteAttributeString("value", property.Item2); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); + //} + + ////for now we are not going to write top-down truncations generated for top-down truncation search. 
+ ////some day we could write those if observed + ////the truncation designation is contained in the "type" field of ProteolysisProduct + //List proteolysisProducts = nucleicAcid.ProteolysisProducts.Where(p => !p.Type.Contains("truncation")).ToList(); + //foreach (var proteolysisProduct in proteolysisProducts) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", proteolysisProduct.Type.Split('(')[0]); + // writer.WriteStartElement("location"); + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", proteolysisProduct.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", proteolysisProduct.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + //} + + foreach (var hm in GetModsForThisBioPolymer(nucleicAcid, null, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + { + foreach (var modId in hm.Value) + { + writer.WriteStartElement("feature"); + writer.WriteAttributeString("type", "modified residue"); + writer.WriteAttributeString("description", modId); + writer.WriteStartElement("location"); + writer.WriteStartElement("position"); + writer.WriteAttributeString("position", hm.Key.ToString(CultureInfo.InvariantCulture)); + writer.WriteEndElement(); + writer.WriteEndElement(); + writer.WriteEndElement(); + } + } + + //foreach (var hm in nucleicAcid.SequenceVariations) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "sequence variant"); + // writer.WriteAttributeString("description", hm.Description.ToString()); + // writer.WriteStartElement("original"); + // writer.WriteString(hm.OriginalSequence); + // writer.WriteEndElement(); // original + // writer.WriteStartElement("variation"); + // writer.WriteString(hm.VariantSequence); + // writer.WriteEndElement(); // variation + // 
writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // foreach (var hmm in GetModsForThisProtein(nucleicAcid, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) + // { + // foreach (var modId in hmm.Value) + // { + // writer.WriteStartElement("subfeature"); + // writer.WriteAttributeString("type", "modified residue"); + // writer.WriteAttributeString("description", modId); + // writer.WriteStartElement("location"); + // writer.WriteStartElement("subposition"); + // writer.WriteAttributeString("subposition", hmm.Key.ToString(CultureInfo.InvariantCulture)); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // writer.WriteEndElement(); + // } + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + //foreach (var hm in nucleicAcid.DisulfideBonds) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "disulfide bond"); + // writer.WriteAttributeString("description", hm.Description); + // writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // 
writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + //foreach (var hm in nucleicAcid.SpliceSites) + //{ + // writer.WriteStartElement("feature"); + // writer.WriteAttributeString("type", "splice site"); + // writer.WriteAttributeString("description", hm.Description); + // writer.WriteStartElement("location"); + // if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition) + // { + // writer.WriteStartElement("position"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // } + // else + // { + // writer.WriteStartElement("begin"); + // writer.WriteAttributeString("position", hm.OneBasedBeginPosition.ToString()); + // writer.WriteEndElement(); + // writer.WriteStartElement("end"); + // writer.WriteAttributeString("position", hm.OneBasedEndPosition.ToString()); + // writer.WriteEndElement(); + // } + // writer.WriteEndElement(); // location + // writer.WriteEndElement(); // feature + //} + + writer.WriteStartElement("sequence"); + writer.WriteAttributeString("length", nucleicAcid.Length.ToString(CultureInfo.InvariantCulture)); + writer.WriteString(nucleicAcid.BaseSequence); + writer.WriteEndElement(); // sequence + writer.WriteEndElement(); // entry + } + + writer.WriteEndElement(); // mzLibProteinDb + writer.WriteEndDocument(); + } + return newModResEntries; + } + /// /// Writes a protein database in mzLibProteinDb format, with additional modifications from the AdditionalModsToAddToProteins list. 
/// @@ -49,8 +324,17 @@ public static Dictionary WriteXmlDatabase(Dictionary allRelevantModifications = new HashSet( - nonVariantProteins.SelectMany(p => p.SequenceVariations.SelectMany(sv => sv.OneBasedModifications).Concat(p.OneBasedPossibleLocalizedModifications).SelectMany(kv => kv.Value)) - .Concat(additionalModsToAddToProteins.Where(kv => nonVariantProteins.SelectMany(p => p.SequenceVariations.Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession })).Contains(kv.Key)).SelectMany(kv => kv.Value.Select(v => v.Item2)))); + nonVariantProteins + .SelectMany(p => p.SequenceVariations + .SelectMany(sv => sv.OneBasedModifications) + .Concat(p.OneBasedPossibleLocalizedModifications) + .SelectMany(kv => kv.Value)) + .Concat(additionalModsToAddToProteins + .Where(kv => nonVariantProteins + .SelectMany(p => p.SequenceVariations + .Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession })) + .Contains(kv.Key)) + .SelectMany(kv => kv.Value.Select(v => v.Item2)))); foreach (Modification mod in allRelevantModifications.OrderBy(m => m.IdWithMotif)) { @@ -109,7 +393,7 @@ public static Dictionary WriteXmlDatabase(Dictionary property in dbRef.Properties) + foreach (Tuple property in dbRef.Properties.OrderBy(t => t.Item1).ThenBy(t => t.Item2)) { writer.WriteStartElement("property"); writer.WriteAttributeString("type", property.Item1); @@ -122,7 +406,8 @@ public static Dictionary WriteXmlDatabase(Dictionary proteolysisProducts = protein.ProteolysisProducts.Where(p => !p.Type.Contains("truncation")).ToList(); + List proteolysisProducts = protein.ProteolysisProducts.Where(p => !p.Type.Contains("truncation")) + .OrderBy(p => p.OneBasedBeginPosition).ToList(); foreach (var proteolysisProduct in proteolysisProducts) { writer.WriteStartElement("feature"); @@ -138,23 +423,24 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var positionModKvp in GetModsForThisBioPolymer(protein, null, 
additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) { - foreach (var modId in hm.Value) + foreach (var modId in positionModKvp.Value.OrderBy(mod => mod)) { writer.WriteStartElement("feature"); writer.WriteAttributeString("type", "modified residue"); writer.WriteAttributeString("description", modId); writer.WriteStartElement("location"); writer.WriteStartElement("position"); - writer.WriteAttributeString("position", hm.Key.ToString(CultureInfo.InvariantCulture)); + writer.WriteAttributeString("position", positionModKvp.Key.ToString(CultureInfo.InvariantCulture)); writer.WriteEndElement(); writer.WriteEndElement(); writer.WriteEndElement(); } } - foreach (var hm in protein.SequenceVariations) + + foreach (var hm in protein.SequenceVariations.OrderBy(sv => sv.OneBasedBeginPosition).ThenBy(sv => sv.VariantSequence)) { writer.WriteStartElement("feature"); writer.WriteAttributeString("type", "sequence variant"); @@ -181,9 +467,9 @@ public static Dictionary WriteXmlDatabase(Dictionary b.Key)) + foreach (var hmm in GetModsForThisBioPolymer(protein, hm, additionalModsToAddToProteins, newModResEntries).OrderBy(b => b.Key)) { - foreach (var modId in hmm.Value) + foreach (var modId in hmm.Value.OrderBy(mod => mod)) { writer.WriteStartElement("subfeature"); writer.WriteAttributeString("type", "modified residue"); @@ -200,7 +486,7 @@ public static Dictionary WriteXmlDatabase(Dictionary bond.OneBasedBeginPosition)) { writer.WriteStartElement("feature"); writer.WriteAttributeString("type", "disulfide bond"); @@ -225,7 +511,7 @@ public static Dictionary WriteXmlDatabase(Dictionary site.OneBasedBeginPosition)) { writer.WriteStartElement("feature"); writer.WriteAttributeString("type", "splice site"); @@ -263,6 +549,7 @@ public static Dictionary WriteXmlDatabase(Dictionary proteinList, string outputFileName, string delimeter) { using (StreamWriter writer = new StreamWriter(outputFileName)) @@ -276,7 +563,7 @@ public static void WriteFastaDatabase(List proteinList, 
string outputFi } } - private static Dictionary> GetModsForThisProtein(Protein protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) + private static Dictionary> GetModsForThisBioPolymer(IBioPolymer protein, SequenceVariation seqvar, Dictionary>> additionalModsToAddToProteins, Dictionary newModResEntries) { var modsToWriteForThisSpecificProtein = new Dictionary>(); @@ -292,7 +579,8 @@ private static Dictionary> GetModsForThisProtein(Protein pr } } - string accession = seqvar == null ? protein.Accession : VariantApplication.GetAccession(protein, new[] { seqvar }); + // This cast to protein is okay as no sequence variation is programmed to RNA as of 9/24/24 + string accession = seqvar == null ? protein.Accession : VariantApplication.GetAccession(protein as Protein, new[] { seqvar }); if (additionalModsToAddToProteins.ContainsKey(accession)) { foreach (var ye in additionalModsToAddToProteins[accession]) diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index a93c896e7..3c583a1a2 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -5,6 +5,8 @@ using System.Text.RegularExpressions; using System.Xml; using Omics.Modifications; +using Transcriptomics; +using UsefulProteomicsDatabases.Transcriptomics; namespace UsefulProteomicsDatabases { @@ -182,6 +184,38 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExcl return protein; } + internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExclude, + Dictionary unknownModifications, + bool isContaminant, string rnaDbLocation) + { + RNA result = null; + if (xml.Name == "feature") + { + ParseFeatureEndElement(xml, modTypesToExclude, unknownModifications); + } + if (xml.Name == "subfeature") + { + ParseSubFeatureEndElement(xml, modTypesToExclude, unknownModifications); + } + else if (xml.Name == 
"dbReference") + { + ParseDatabaseReferenceEndElement(xml); + } + else if (xml.Name == "gene") + { + ReadingGene = false; + } + else if (xml.Name == "organism") + { + ReadingOrganism = false; + } + else if (xml.Name == "entry") + { + result = ParseRnaEntryEndElement(xml, isContaminant, rnaDbLocation, modTypesToExclude, unknownModifications); + } + return result; + } + /// /// Finish parsing an entry /// @@ -202,6 +236,24 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr return result; } + internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation, + IEnumerable modTypesToExclude, Dictionary unknownModifications) + { + RNA result = null; + if (Accession != null && Sequence != null) + { + // sanitize the sequence to replace unexpected characters with X (unknown amino acid) + // sometimes strange characters get added by RNA sequencing software, etc. + Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X'); + + ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods); + result = new RNA(Sequence, Name, Accession, Organism, rnaDbLocation, null, + null, OneBasedModifications, isContaminant, false, GeneNames, null); + } + Clear(); + return result; + } + /// /// Finish parsing a subfeature element /// @@ -224,6 +276,11 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo FeatureDescription = FeatureDescription.Split(';')[0]; AnnotatedMods.Add((OneBasedFeaturePosition, FeatureDescription)); } + else if (FeatureType == "lipid moiety-binding region") + { + FeatureDescription = FeatureDescription.Split(';')[0]; + AnnotatedMods.Add((OneBasedFeaturePosition, FeatureDescription)); + } else if (FeatureType == "peptide" || FeatureType == "propeptide" || FeatureType == "chain" || FeatureType == "signal peptide") { string type = FeatureType; @@ -304,7 +361,8 @@ private static void ParseAnnotatedMods(Dictionary> desti string annotatedId = 
annotatedMod.Item2; int annotatedModLocation = annotatedMod.Item1; - if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)) + if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod) + || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod)) { // if the list of known mods contains this IdWithMotif if (!modTypesToExclude.Contains(foundMod.ModificationType)) @@ -322,7 +380,8 @@ private static void ParseAnnotatedMods(Dictionary> desti } // no known mod - try looking it up in the dictionary of mods without motif appended - else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods)) + else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods) + || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods)) { foreach (Modification mod in mods) { @@ -352,19 +411,6 @@ private static void ParseAnnotatedMods(Dictionary> desti } } - private static ModificationMotif GetMotif(string proteinSequence, int position) - { - string aminoAcid = proteinSequence.Substring(position - 1, 1); - if (ModificationMotif.TryGetMotif(aminoAcid, out ModificationMotif motif)) - { - return motif; - } - else - { - return null; - } - } - /// /// Finish parsing a database reference element /// diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs new file mode 100644 index 000000000..2e80c090c --- /dev/null +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs @@ -0,0 +1,261 @@ +using Omics.Modifications; +using System; +using System.Collections.Generic; +using System.IO.Compression; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using System.Xml; +using Chemistry; +using Transcriptomics; + +namespace UsefulProteomicsDatabases.Transcriptomics +{ + public enum RnaFastaHeaderType + { + 
Modomics, + Unknown, + } + + public static class RnaDbLoader + { + + #region Header Detection and Property Regexes + + public static RnaFastaHeaderType DetectRnaFastaHeaderType(string line) + { + if (line.StartsWith(">id")) + return RnaFastaHeaderType.Modomics; + + return RnaFastaHeaderType.Unknown; + } + + /// + /// Dictionary that extract accession number, species, name, and additional dataField of modomics + /// + public static readonly Dictionary ModomicsFieldRegexes = + new Dictionary() + { + { "Id", new FastaHeaderFieldRegex("Id", @"id:(?.+?)\|", 0, 1) }, + { "Name", new FastaHeaderFieldRegex("Name", @"Name:(?.+?)\|", 0, 1) }, + { "SOterm", new FastaHeaderFieldRegex("SOterm", @"SOterm:(?.+?)\|", 0, 1) }, + { "Type", new FastaHeaderFieldRegex("Type", @"Type:(?.+?)\|", 0, 1) }, + { "Subtype", new FastaHeaderFieldRegex("Subtype", @"Subtype:(?.+?)\|", 0, 1) }, + { "Feature", new FastaHeaderFieldRegex("Feature", @"Feature:(?.+?)\|", 0, 1) }, + { "Organism", new FastaHeaderFieldRegex("Organism", @"Species:(?.+?)$", 0, 1) }, + { "Cellular Localization", new FastaHeaderFieldRegex("CellularLocalization", @"Cellular_Localization:(?.+?)\|", 0, 1) }, + }; + + #endregion + + /// + /// Loads an RNA file from the specified location, optionally generating decoys and adding error tracking + /// + /// The file path to the RNA FASTA database + /// Flag indicating whether to generate targets or not + /// The type of decoy generation to apply + /// Indicates if the RNA sequence is a contaminant + /// Outputs any errors encountered during the process + /// An optional 5' prime chemical modification term + /// An optional 3' prime chemical modification term + /// A list of RNA sequences loaded from the FASTA database + /// Thrown if the FASTA header format is unknown or other issues occur during loading. + + public static List LoadRnaFasta(string rnaDbLocation, bool generateTargets, DecoyType decoyType, + bool isContaminant, out List errors, IHasChemicalFormula? 
fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null) + { + RnaFastaHeaderType? headerType = null; + Regex substituteWhitespace = new Regex(@"\s+"); + errors = new List(); + List targets = new List(); + string identifierHeader = null; + + string name = null; + string organism = null; + string identifier = null; + + string newDbLocation = rnaDbLocation; + + //we had trouble decompressing and streaming on the fly so we decompress completely first, then stream the file, then delete the decompressed file + if (rnaDbLocation.EndsWith(".gz")) + { + newDbLocation = Path.Combine(Path.GetDirectoryName(rnaDbLocation), "temp.fasta"); + using var stream = new FileStream(rnaDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read); + using FileStream outputFileStream = File.Create(newDbLocation); + using var decompressor = new GZipStream(stream, CompressionMode.Decompress); + decompressor.CopyTo(outputFileStream); + } + + using (var fastaFileStream = new FileStream(newDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + StringBuilder sb = null; + StreamReader fasta = new StreamReader(fastaFileStream); + Dictionary regexResults = new(); + Dictionary regexes = null; + + while (true) + { + string line = ""; + line = fasta.ReadLine(); + if (line == null) { break; } + + if (line.StartsWith(">")) + { + if (headerType is null) + { + headerType = DetectRnaFastaHeaderType(line); + + switch (headerType) + { + case RnaFastaHeaderType.Modomics: + regexes = ModomicsFieldRegexes; + identifierHeader = "SOterm"; + break; + default: + throw new MzLibUtil.MzLibException("Unknown fasta header format: " + line); + } + } + + + regexResults = ParseRegexFields(line, regexes); + name = regexResults["Name"]; + regexResults.Remove("Name"); + organism = regexResults["Organism"]; + regexResults.Remove("Organism"); + identifier = regexResults[identifierHeader]; + regexResults.Remove(identifierHeader); + + sb = new StringBuilder(); + } + else + { + sb?.Append(line.Trim()); + } + 
+ if ((fasta.Peek() == '>' || fasta.Peek() == -1) /*&& accession != null*/ && sb != null) + { + string sequence = substituteWhitespace.Replace(sb.ToString(), ""); + Dictionary additonalDatabaseFields = + regexResults.ToDictionary(x => x.Key, x => x.Value); + + // Do we need to sanitize the sequence? + + RNA rna = new RNA(sequence, name, identifier, organism, rnaDbLocation, + fivePrimeTerm, threePrimeTerm, null, + isContaminant, false, null, additonalDatabaseFields); + if (rna.Length == 0) + errors.Add("Line" + line + ", Rna length of 0: " + rna.Name + "was skipped from database: " + rnaDbLocation); + else + targets.Add(rna); + + name = null; + organism = null; + identifier = null; + regexResults.Clear(); + } + + // no input left + if (fasta.Peek() == -1) + { + break; + } + } + } + + if (newDbLocation != rnaDbLocation) + File.Delete(newDbLocation); + + if (!targets.Any()) + errors.Add("No targets were loaded from database: " + rnaDbLocation); + + List decoys = RnaDecoyGenerator.GenerateDecoys(targets, decoyType); + return generateTargets ? targets.Concat(decoys).ToList() : decoys; + } + + + + private static Dictionary ParseRegexFields(string line, + Dictionary regexes) + { + Dictionary fields = new Dictionary(); + + foreach (var regex in regexes) + { + string match = regex.Value.ApplyRegex(line); + fields.Add(regex.Key, match); + } + + return fields; + } + + public static Dictionary> IdToPossibleMods = new Dictionary>(); + public static Dictionary IdWithMotifToMod = new Dictionary(); + + public static List LoadRnaXML(string rnaDbLocation, bool generateTargets, DecoyType decoyType, + bool isContaminant, IEnumerable allKnownModifications, + IEnumerable modTypesToExclude, out Dictionary unknownModifications, + int maxThreads = 1, IHasChemicalFormula? fivePrimeTerm = null, IHasChemicalFormula? threePrimeTerm = null) + { + var prespecified = ProteinDbLoader.GetPtmListFromProteinXml(rnaDbLocation); + allKnownModifications = allKnownModifications ?? 
new List(); + modTypesToExclude = modTypesToExclude ?? new List(); + + if (prespecified.Count > 0 || allKnownModifications.Count() > 0) + { + //modsDictionary = GetModificationDict(new HashSet(prespecified.Concat(allKnownModifications))); + IdToPossibleMods = ProteinDbLoader.GetModificationDict(new HashSet(prespecified.Concat(allKnownModifications))); + IdWithMotifToMod = ProteinDbLoader.GetModificationDictWithMotifs(new HashSet(prespecified.Concat(allKnownModifications))); + } + List targets = new List(); + unknownModifications = new Dictionary(); + + string newProteinDbLocation = rnaDbLocation; + + //we had trouble decompressing and streaming on the fly so we decompress completely first, then stream the file, then delete the decompressed file + if (rnaDbLocation.EndsWith(".gz")) + { + newProteinDbLocation = Path.Combine(Path.GetDirectoryName(rnaDbLocation), "temp.xml"); + using var stream = new FileStream(rnaDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read); + using FileStream outputFileStream = File.Create(newProteinDbLocation); + using var decompressor = new GZipStream(stream, CompressionMode.Decompress); + decompressor.CopyTo(outputFileStream); + } + + using (var uniprotXmlFileStream = new FileStream(newProteinDbLocation, FileMode.Open, FileAccess.Read, FileShare.Read)) + { + Regex substituteWhitespace = new Regex(@"\s+"); + + ProteinXmlEntry block = new ProteinXmlEntry(); + + using (XmlReader xml = XmlReader.Create(uniprotXmlFileStream)) + { + while (xml.Read()) + { + if (xml.NodeType == XmlNodeType.Element) + { + block.ParseElement(xml.Name, xml); + } + if (xml.NodeType == XmlNodeType.EndElement || xml.IsEmptyElement) + { + RNA newProtein = block.ParseRnaEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, rnaDbLocation); + if (newProtein != null) + { + targets.Add(newProtein); + } + } + } + } + } + if (newProteinDbLocation != rnaDbLocation) + { + File.Delete(newProteinDbLocation); + } + + List decoys = 
RnaDecoyGenerator.GenerateDecoys(targets, decoyType, maxThreads); + IEnumerable proteinsToExpand = generateTargets ? targets.Concat(decoys) : decoys; + return proteinsToExpand.ToList(); + } + } +} diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs new file mode 100644 index 000000000..b9cc20e1d --- /dev/null +++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDecoyGenerator.cs @@ -0,0 +1,89 @@ +using Proteomics; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using MassSpectrometry; +using Omics.Modifications; +using Transcriptomics; + +namespace UsefulProteomicsDatabases.Transcriptomics +{ + /// + /// Provides methods for generating decoy nucleic acids from any implementor of . + /// + /// + /// This class supports various types of decoy generation, including reversing, sliding, and shuffling sequences. + /// It allows for the creation of decoy sequences while preserving certain characteristics such as modification sites and termini. + /// The GenerateDecoys method serves as the main entry point, delegating to specific decoy generation methods based on the specified . + /// TODO: Implement Shuffle and Slide Decoys + /// TODO: Consider passing digestion motif as optional parameter to leave digestion sites intact. Currently leaving the 3' intact as it is the predominant cleavage motif. + /// TODO: Consider palindromic sequences and the result they have on fragment ions (d/z are identical, c/y are identical). 
This will be particularly important for slided decoys + /// + public static class RnaDecoyGenerator + { + public static List GenerateDecoys(List nucleicAcids, DecoyType decoyType, int maxThreads = -1) where T : INucleicAcid + { + switch (decoyType) + { + case DecoyType.None: + return new List(); + case DecoyType.Reverse: + return GenerateReverseDecoys(nucleicAcids, maxThreads); + case DecoyType.Slide: + return GenerateSlidedDecoys(nucleicAcids, maxThreads); + case DecoyType.Shuffle: + return GenerateShuffledDeocys(nucleicAcids, maxThreads); + case DecoyType.Random: + default: + throw new ArgumentOutOfRangeException(nameof(decoyType), decoyType, null); + } + } + + /// + /// Generated decoys in which the sequence is reversed, + /// leaving modification on their nucleic acid of origin, + /// and 3' termini intact as it is the most likely cleavage site. + /// + /// + /// + /// + private static List GenerateReverseDecoys(List nucleicAcids, int maxThreads = -1) where T : INucleicAcid + { + List decoyNucleicAcids = new List(); + Parallel.ForEach(nucleicAcids, new ParallelOptions() { MaxDegreeOfParallelism = maxThreads }, nucleicAcid => + { + // reverse sequence + var reverseSequence = + new string(nucleicAcid.BaseSequence[..^1].Reverse().Append(nucleicAcid.BaseSequence.Last()).ToArray()); + + // reverse modifications + var reverseModifications = new Dictionary>(); + foreach (var kvp in nucleicAcid.OneBasedPossibleLocalizedModifications) + { + var reverseKey = kvp.Key == reverseSequence.Length ? 
kvp.Key : reverseSequence.Length - kvp.Key; + reverseModifications.Add(reverseKey, kvp.Value); + } + + T newNucleicAcid = nucleicAcid.CreateNew(reverseSequence, reverseModifications, true); + lock (decoyNucleicAcids) + { + decoyNucleicAcids.Add(newNucleicAcid); + } + }); + return decoyNucleicAcids; + } + + private static List GenerateSlidedDecoys(List nucleicAcids, int maxThreads = -1) where T : INucleicAcid + { + throw new NotImplementedException(); + } + + private static List GenerateShuffledDeocys(List nucleicAcids, int maxThreads = -1) where T : INucleicAcid + { + throw new NotImplementedException(); + } + + } +} diff --git a/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj b/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj index d4e73fa42..f057fc396 100644 --- a/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj +++ b/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj @@ -19,6 +19,7 @@ + diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 626ecf12f..d23d69f82 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -16,22 +16,32 @@ + + + + + + + + + + @@ -42,6 +52,10 @@ + + + + @@ -55,6 +69,10 @@ + + + + @@ -64,6 +82,10 @@ + + + + @@ -77,6 +99,10 @@ + + + + diff --git a/mzLib/mzLib.sln.DotSettings b/mzLib/mzLib.sln.DotSettings index 78477fa52..69660cbfe 100644 --- a/mzLib/mzLib.sln.DotSettings +++ b/mzLib/mzLib.sln.DotSettings @@ -1,11 +1,18 @@ - + + True True True True True True + True + True + True + True True + True True True + True True True \ No newline at end of file