Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fragger hyper score #2407

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
bd5aefc
update mzlib nuget package to 551
trishorts Aug 26, 2024
0ff7b36
add hyperscore to PEP calculation
trishorts Aug 29, 2024
b5a8be8
update from master
trishorts Aug 29, 2024
929c326
pio
trishorts Aug 29, 2024
fa16571
lets see
trishorts Aug 29, 2024
0c3f584
piu
trishorts Sep 3, 2024
e822a72
Merge remote-tracking branch 'upstream/master' into addFraggerHyperScore
trishorts Sep 9, 2024
bf7b078
length specific hyperscore
trishorts Sep 9, 2024
d08d03d
length 150
trishorts Sep 9, 2024
7a8a303
sfad
trishorts Sep 10, 2024
79282d4
works
trishorts Sep 10, 2024
36f489f
correctly formated percolator output
trishorts Sep 10, 2024
719f33d
adding unit tests
trishorts Sep 11, 2024
6b86df7
unit test 2
trishorts Sep 11, 2024
ddd7460
fix unit test 1
trishorts Sep 11, 2024
3782c5e
fix unit test 2
trishorts Sep 11, 2024
983f54c
fix unit test for psm intensity in psmdata we don't have normalizatio…
trishorts Sep 11, 2024
8eb52e0
fix last unit test
trishorts Sep 11, 2024
fa56b88
fixed percolator output second row header with default direction
trishorts Sep 11, 2024
4c03b28
remove space between Length and the number in the percolator header
trishorts Sep 11, 2024
ba178bf
correct formatting
trishorts Sep 13, 2024
37a32b9
fixed unit tests
trishorts Sep 13, 2024
f200d02
fix psmdata tostring
trishorts Sep 13, 2024
714d623
merge upstream
trishorts Sep 30, 2024
dfcdd40
Merge branch 'master' into addFraggerHyperScore
nbollis Oct 4, 2024
aa7ea4a
add xcorr to PEP
trishorts Nov 13, 2024
7111053
Merge branch 'addFraggerHyperScore' of https://github.com/trishorts/M…
trishorts Nov 13, 2024
9e81ffa
optimized for speed
trishorts Nov 13, 2024
7918e33
merge upstream
trishorts Nov 13, 2024
7de17ef
log 10 factorial tests
trishorts Nov 13, 2024
7e17c0e
xcorr unit tests
trishorts Nov 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 138 additions & 10 deletions MetaMorpheus/EngineLayer/FdrAnalysis/PEPAnalysisEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
using Omics.Modifications;
using Omics;
using Easy.Common.Extensions;
using MathNet.Numerics.LinearAlgebra.Solvers;

namespace EngineLayer
{
Expand Down Expand Up @@ -56,7 +57,7 @@ public class PepAnalysisEngine
//The value Tuple is the average and standard deviation, respectively, of the predicted hydrophobicities of the observed peptides eluting at that rounded retention time.
public Dictionary<string, Dictionary<int, Tuple<double, double>>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified { get; private set; }
public Dictionary<string, Dictionary<int, Tuple<double, double>>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified { get; private set; }
public Dictionary<string, Dictionary<int, Tuple<double, double>>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE { get; private set; }
public Dictionary<string, Dictionary<int, Tuple<double, double>>> FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE { get; private set; }

/// <summary>
/// A dictionary which stores the chimeric ID string in the key and the number of chimeric identifications as the vale
Expand Down Expand Up @@ -108,7 +109,7 @@ public string ComputePEPValuesForAllPSMs()
? SpectralMatchGroup.GroupByBaseSequence(AllPsms)
: SpectralMatchGroup.GroupByIndividualPsm(AllPsms);

if(UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4))
if (UsePeptideLevelQValueForTraining && (peptideGroups.Count(g => g.BestMatch.IsDecoy) < 4 || peptideGroups.Count(g => !g.BestMatch.IsDecoy) < 4))
{
// If we don't have enough peptides to train at the peptide level, we will train at the PSM level
peptideGroups = SpectralMatchGroup.GroupByIndividualPsm(AllPsms);
Expand All @@ -120,9 +121,9 @@ public string ComputePEPValuesForAllPSMs()
IEnumerable<PsmData>[] PSMDataGroups = new IEnumerable<PsmData>[numGroups];
for (int i = 0; i < numGroups; i++)
{
PSMDataGroups[i] = CreatePsmData(SearchType, peptideGroups, peptideGroupIndices[i]);
PSMDataGroups[i] = CreatePsmData(SearchType, peptideGroups, peptideGroupIndices[i]);

if(!PSMDataGroups[i].Any(p => p.Label) || !PSMDataGroups[i].Any(p => !p.Label))
if (!PSMDataGroups[i].Any(p => p.Label) || !PSMDataGroups[i].Any(p => !p.Label))
{
return "Posterior error probability analysis failed. This can occur for small data sets when some sample groups are missing positive or negative training examples.";
}
Expand Down Expand Up @@ -178,11 +179,11 @@ public void BuildFileSpecificDictionaries(List<SpectralMatch> trainingData, stri
{
FileSpecificMedianFragmentMassErrors = GetFileSpecificMedianFragmentMassError(trainingData);
ChargeStateMode = GetChargeStateMode(trainingData);

if (trainingVariables.Contains("HydrophobicityZScore"))
{
FileSpecificTimeDependantHydrophobicityAverageAndDeviation_unmodified = ComputeHydrophobicityValues(trainingData, false);
FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(trainingData, true);
FileSpecificTimeDependantHydrophobicityAverageAndDeviation_modified = ComputeHydrophobicityValues(trainingData, true);
FileSpecificTimeDependantHydrophobicityAverageAndDeviation_CZE = ComputeMobilityValues(trainingData);
}
}
Expand Down Expand Up @@ -465,7 +466,7 @@ public int Compute_PSM_PEP(List<SpectralMatchGroup> peptideGroups,
psm.PsmFdrInfo.PEP = 1 - pepValuePredictions.Max();
psm.PeptideFdrInfo.PEP = 1 - pepValuePredictions.Max();
}

}
}

Expand Down Expand Up @@ -495,6 +496,7 @@ public PsmData CreateOnePsmDataEntry(string searchType, SpectralMatch psm, IBioP
float peaksInPrecursorEnvelope = 0;
float mostAbundantPrecursorPeakIntensity = 0;
float fractionalIntensity = 0;
float fragerHyperScoreByLength = 0;

float missedCleavages = 0;
float longestSeq = 0;
Expand All @@ -520,9 +522,9 @@ public PsmData CreateOnePsmDataEntry(string searchType, SpectralMatch psm, IBioP
normalizationFactor = 1.0;
}
// count only terminal fragment ions
totalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType == null) / normalizationFactor * multiplier, 0));
totalMatchingFragmentCount = (float)psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count();
internalMatchingFragmentCount = (float)(Math.Round(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide].Count(p => p.NeutralTheoreticalProduct.SecondaryProductType != null) / normalizationFactor * multiplier, 0));
intensity = (float)Math.Min(50, Math.Round((psm.Score - (int)psm.Score) / normalizationFactor * Math.Pow(multiplier, 2), 0));
intensity = (float)(psm.Score - (int)psm.Score);
chargeDifference = -Math.Abs(ChargeStateMode - psm.ScanPrecursorCharge);
deltaScore = (float)Math.Round(psm.DeltaScore / normalizationFactor * multiplier, 0);
notch = notchToUse;
Expand All @@ -532,6 +534,7 @@ public PsmData CreateOnePsmDataEntry(string searchType, SpectralMatch psm, IBioP
absoluteFragmentMassError = (float)Math.Min(100.0, Math.Round(10.0 * Math.Abs(GetAverageFragmentMassError(psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide]) - FileSpecificMedianFragmentMassErrors[Path.GetFileName(psm.FullFilePath)])));
}


ambiguity = Math.Min((float)(psm.BioPolymersWithSetModsToMatchingFragments.Keys.Count - 1), 10);
//ambiguity = 10; // I'm pretty sure that you shouldn't train on ambiguity and its skewing the results
longestSeq = (float)Math.Round(SpectralMatch.GetLongestIonSeriesBidirectional(psm.BioPolymersWithSetModsToMatchingFragments, selectedPeptide) / normalizationFactor * multiplier, 0);
Expand All @@ -543,6 +546,7 @@ public PsmData CreateOnePsmDataEntry(string searchType, SpectralMatch psm, IBioP
peaksInPrecursorEnvelope = psm.PrecursorScanEnvelopePeakCount;
mostAbundantPrecursorPeakIntensity = (float)Math.Round((float)psm.PrecursorScanIntensity / normalizationFactor * multiplier, 0);
fractionalIntensity = (float)psm.PrecursorFractionalIntensity;
fragerHyperScoreByLength = GetFraggerHyperScore(psm, selectedPeptide);

if (PsmHasSpectralAngle(psm))
{
Expand Down Expand Up @@ -660,6 +664,7 @@ public PsmData CreateOnePsmDataEntry(string searchType, SpectralMatch psm, IBioP
MostAbundantPrecursorPeakIntensity = mostAbundantPrecursorPeakIntensity,
PrecursorFractionalIntensity = fractionalIntensity,
InternalIonCount = internalMatchingFragmentCount,
FraggerHyperScorebyLength = fragerHyperScoreByLength
};

return psm.PsmData_forPEPandPercolator;
Expand Down Expand Up @@ -732,7 +737,7 @@ public Dictionary<string, Dictionary<int, Tuple<double, double>>> ComputeHydroph
}
fullSequences.Add(pwsm.FullSequence);

double predictedHydrophobicity = pwsm is PeptideWithSetModifications pep ? calc.ScoreSequence(pep) : 0;
double predictedHydrophobicity = pwsm is PeptideWithSetModifications pep ? calc.ScoreSequence(pep) : 0;

//here i'm grouping this in 2 minute increments becuase there are cases where you get too few data points to get a good standard deviation an average. This is for stability.
int possibleKey = (int)(2 * Math.Round(psm.ScanRetentionTime / 2d, 0));
Expand Down Expand Up @@ -1065,7 +1070,130 @@ public static float GetAverageFragmentMassError(IEnumerable<MatchedFragmentIon>

return massErrors.Average();
}
/// <summary>
/// Taken from Nat. Methods.https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5409104/
/// "MSFragger: ultrafast and comprehensive peptide identification in shotgun proteomics"
/// Andy T. Kong,1,2 Felipe V. Leprevost,2 Dmitry M. Avtonomov,2 Dattatreya Mellacheruvu,2 and Alexey I. Nesvizhskii1,2,*
/// </summary>
/// <param name="psm"></param>
/// <param name="selectedPeptide"></param>
/// <returns></returns>
public static float GetFraggerHyperScore(SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide)
{
var peptideFragmentIons = psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide];
float nIonIntensitySum = 0;
float cIonIntensitySum = 0;
int nIonCount = 0;
int cIonCount = 0;

foreach (var ion in peptideFragmentIons)
{
if (ion.NeutralTheoreticalProduct.Terminus == FragmentationTerminus.N)
{
nIonIntensitySum += (float)ion.Intensity;
nIonCount++;
}
else if (ion.NeutralTheoreticalProduct.Terminus == FragmentationTerminus.C)
{
cIonIntensitySum += (float)ion.Intensity;
cIonCount++;
}
}

float matched_n_IonCountFactorial = nIonCount > 0 ? GetLog10Factorial(nIonCount).Value : 0;
float matched_c_IonCountFactorial = cIonCount > 0 ? GetLog10Factorial(cIonCount).Value : 0;

double log10IntensitySum = (nIonIntensitySum > 0 && cIonIntensitySum > 0) ? Math.Log10(nIonIntensitySum * cIonIntensitySum) : 0.1;

return matched_n_IonCountFactorial + matched_c_IonCountFactorial + (float)log10IntensitySum;
}
/// <summary>
/// https://willfondrie.com/2019/02/an-intuitive-look-at-the-xcorr-score-function-in-proteomics/
///
/// A mass spectrum can be preprocessed by subtracting the mean intensities at all of the offsets.
/// Then a single dot product between the preprocessed mass spectrum and the theoretical peptide
/// mass spectrum yields the xcorr score, which is made possible because of the distributive
/// property of the dot product.
///
/// Since we have already chosen the match for this scan, we can use the matched ions to calculate the
/// xcorr and skip the dot product step.
///
/// </summary>
/// <param name="psm"></param>
/// <param name="selectedPeptide"></param>
/// <returns></returns>
public static float Xcorr(SpectralMatch psm, IBioPolymerWithSetMods selectedPeptide)
{
double xcorr = 0;
var xArray = psm.MsDataScan.MassSpectrum.XArray;
var yArray = psm.MsDataScan.MassSpectrum.YArray;
var fragments = psm.BioPolymersWithSetModsToMatchingFragments[selectedPeptide];

foreach (var peptideFragmentIon in fragments)
{
int startIndex = Array.BinarySearch(xArray, peptideFragmentIon.Mz - 75);
int endIndex = Array.BinarySearch(xArray, peptideFragmentIon.Mz + 75);

// Ensure valid indices
startIndex = startIndex < 0 ? ~startIndex : startIndex;
endIndex = endIndex < 0 ? ~endIndex - 1 : endIndex;

// Sum yArray values between startIndex and endIndex
double sum = 0;
for (int i = startIndex; i <= endIndex; i++)
{
sum += yArray[i];
}
sum -= peptideFragmentIon.Intensity; // Subtract the intensity of the current ion

double range = xArray[endIndex] - xArray[startIndex];
if (range > 0)
{
sum /= range;
}

xcorr += Math.Max(peptideFragmentIon.Intensity - sum, 0);
}

return (float)xcorr;
}

public static float? GetLog10Factorial(int n)
{
if (n < 0)
{
throw new ArgumentOutOfRangeException(nameof(n), "Input must be non-negative.");
}

// Use a precomputed array for small values of n
double[] precomputedLog10Factorials = new double[]
{
0.0, // 0!
0.0, // 1!
0.3010, // 2!
0.7782, // 3!
1.2553, // 4!
1.7324, // 5!
2.2095, // 6!
2.6866, // 7!
3.1637, // 8!
3.6408, // 9!
4.1179 // 10!
};

if (n < precomputedLog10Factorials.Length)
{
return (float)precomputedLog10Factorials[n];
}

double log10Factorial = precomputedLog10Factorials[precomputedLog10Factorials.Length - 1];
for (int i = precomputedLog10Factorials.Length; i <= n; i++)
{
log10Factorial += Math.Log10(i);
}

return (float)log10Factorial;
}
#endregion
}
}
13 changes: 11 additions & 2 deletions MetaMorpheus/EngineLayer/FdrAnalysis/PsmData.cs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public class PsmData
"TotalMatchingFragmentCount", "Intensity", "PrecursorChargeDiffToMode", "DeltaScore",
"Notch", "ModsCount", "AbsoluteAverageFragmentMassErrorFromMedian", "MissedCleavagesCount",
"Ambiguity", "LongestFragmentIonSeries", "ComplementaryIonCount", "HydrophobicityZScore",
"IsVariantPeptide", "IsDeadEnd", "IsLoop", "SpectralAngle", "HasSpectralAngle",
"IsVariantPeptide", "SpectralAngle", "HasSpectralAngle", "FraggerHyperScorebyLength"
}
},

Expand All @@ -26,7 +26,7 @@ public class PsmData
"Notch", "ModsCount", "AbsoluteAverageFragmentMassErrorFromMedian", "Ambiguity",
"LongestFragmentIonSeries", "ComplementaryIonCount", "SpectralAngle",
"HasSpectralAngle", "PeaksInPrecursorEnvelope", "ChimeraCount",
"MostAbundantPrecursorPeakIntensity", "PrecursorFractionalIntensity", "InternalIonCount"
"MostAbundantPrecursorPeakIntensity", "PrecursorFractionalIntensity", "InternalIonCount", "FraggerHyperScorebyLength"
}
},

Expand Down Expand Up @@ -75,6 +75,7 @@ public class PsmData
{ "MostAbundantPrecursorPeakIntensity", 1 },
{ "PrecursorFractionalIntensity", 1 },
{ "InternalIonCount", 1},
{ "FraggerHyperScorebyLength", 1},
}.ToImmutableDictionary();

public string ToString(string searchType)
Expand All @@ -93,6 +94,11 @@ public string ToString(string searchType)
return sb.ToString();
}

//If you need to add an array instead of just one feature you can use the following code
//[LoadColumn(29, 103)]
//[VectorType(75)]
//public float[] FraggerHyperScorebyLength { get; set; }

[LoadColumn(0)]
public float Intensity { get; set; }

Expand Down Expand Up @@ -179,5 +185,8 @@ public string ToString(string searchType)

[LoadColumn(28)]
public float InternalIonCount { get; set; }

[LoadColumn(29)]
public float FraggerHyperScorebyLength { get; set; }
}
}
52 changes: 31 additions & 21 deletions MetaMorpheus/TaskLayer/SearchTask/PostSearchAnalysisTask.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1778,45 +1778,55 @@ private static void WritePsmsForPercolator(List<SpectralMatch> psmList, string w
{
using (StreamWriter output = new StreamWriter(writtenFileForPercolator))
{
string searchType;
string searchType = "standard";
if (psmList.Where(p => p != null).Any() && psmList[0].DigestionParams.Protease.Name != null && psmList[0].DigestionParams.Protease.Name == "top-down")
{
searchType = "top-down";
}
else

//header
string header = "SpecId\tLabel\tScanNr";

StringBuilder sb = new StringBuilder();
var variablesToOutput = PsmData.trainingInfos[searchType];

foreach (var variable in variablesToOutput)
{
searchType = "standard";
sb.Append("\t");
sb.Append(variable.ToString());
}

string header = "SpecId\tLabel\tScanNr\t";
header += String.Join("\t", PsmData.trainingInfos[searchType]);
header += sb.ToString();
header += "\tPeptide\tProteins";

output.WriteLine(header);

StringBuilder directions = new StringBuilder();
directions.Append("DefaultDirection\t-\t-");
//direction
string direction = "DefaultDirection\t\t";

foreach (var headerVariable in PsmData.trainingInfos[searchType])
sb = new StringBuilder();
variablesToOutput = PsmData.trainingInfos[searchType];

foreach (var variable in variablesToOutput)
{
directions.Append("\t");
directions.Append(PsmData.assumedAttributeDirection[headerVariable]);
sb.Append("\t");
sb.Append(PsmData.assumedAttributeDirection[variable]);
}
direction += sb.ToString();
direction += "\t\t";
output.WriteLine(direction);

output.WriteLine(directions.ToString());

//psmdata lines
int idNumber = 0;
psmList.OrderByDescending(p => p.Score);
psmList.OrderByDescending(p => p);
foreach (SpectralMatch psm in psmList.Where(p => p.PsmData_forPEPandPercolator != null))
{
foreach (var peptide in psm.BestMatchingBioPolymersWithSetMods)
{
output.Write(idNumber.ToString());
output.Write('\t' + (peptide.Peptide.Parent.IsDecoy ? -1 : 1).ToString());
output.Write('\t' + psm.ScanNumber.ToString());
output.Write(psm.PsmData_forPEPandPercolator.ToString(searchType));
output.Write('\t' + (peptide.Peptide.PreviousResidue + "." + peptide.Peptide.FullSequence + "." + peptide.Peptide.NextResidue).ToString());
output.Write('\t' + (peptide.Peptide.Parent.Accession).ToString());
output.Write(idNumber.ToString()); //id number
output.Write('\t' + (peptide.Peptide.Parent.IsDecoy ? -1 : 1).ToString()); //label
output.Write('\t' + psm.ScanNumber.ToString()); //scan number
output.Write(psm.PsmData_forPEPandPercolator.ToString(searchType));//psmdata
output.Write('\t' + (peptide.Peptide.PreviousResidue + "." + peptide.Peptide.FullSequence + "." + peptide.Peptide.NextResidue).ToString());//peptide
output.Write('\t' + (peptide.Peptide.Parent.Accession).ToString());//proteins
output.WriteLine();
}
idNumber++;
Expand Down
Loading