From a652d70b1048986bec143d8f7737010778e85391 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sun, 1 Nov 2020 01:42:50 -0700 Subject: [PATCH 01/14] Group-FDR is tentatively working. --- mokapot/confidence.py | 90 +++++++++++++++++++++++++++++++++++++++---- mokapot/dataset.py | 81 +++++++++++++++++++++++--------------- mokapot/parsers.py | 40 +++++++++---------- mokapot/utils.py | 5 +-- tests/test_dataset.py | 3 +- 5 files changed, 154 insertions(+), 65 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 3760f35b..5b277f1b 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -1,8 +1,7 @@ """ One of the primary purposes of mokapot is to assign confidence estimates to PSMs. This task is accomplished by ranking PSMs according to a score or metric and -using an appropriate confidence estimation procedure for the type of data -(currently, linear and cross-linked PSMs are supported). In either case, +using an appropriate confidence estimation procedure for the type of data. mokapot can provide confidence estimates based any score, regardless of whether it was the result of a learned :py:func:`mokapot.model.Model` instance or provided independently. @@ -11,10 +10,10 @@ provided score. In either case, they provide utilities to access, save, and plot these estimates for the various relevant levels (i.e. PSMs, peptides, and proteins). The :py:func:`LinearConfidence` class is appropriate for most -proteomics datasets, whereas the :py:func:`CrossLinkedConfidence` is -specifically designed for crosslinked peptides. +proteomics datasets. """ import os +import copy import logging import pandas as pd @@ -29,6 +28,83 @@ # Classes --------------------------------------------------------------------- +class GroupedConfidence: + """ + Performed grouped confidence estimation for a collection of PSMs. + + Parameters + ---------- + psms : LinearPsmDataset object + A collection of PSMs. + scores : np.ndarray + A vector containing the score of each PSM. + desc : bool + Are higher scores better? + eval_fdr : float + The FDR threshold at which to report performance. This parameter + has no affect on the analysis itself, only logging messages. + """ + + def __init__(self, psms, scores, desc=True, eval_fdr=0.01): + """Initialize a GroupedConfidence object""" + group_psms = copy.copy(psms) + group_psms._group_column = None + scores = scores * (desc * 2 - 1) + + # Do TDC + scores = ( + pd.Series(scores, index=psms._data.index) + .sample(frac=1) + .sort_values() + ) + + idx = ( + psms.data.loc[scores.index, :] + .drop_duplicates(psms._spectrum_columns, keep="last") + .index + ) + + self.group_confidence_estimates = {} + for group, group_df in psms._data.groupby(psms._group_column): + group_psms._data = None + tdc_winners = group_df.index.intersection(idx) + group_psms._data = group_df.loc[tdc_winners, :] + group_scores = scores.loc[group_psms._data.index].values + 1 + res = group_psms.assign_confidence( + (group_scores + 1) / (2 * desc), desc=desc, eval_fdr=eval_fdr + ) + self.group_confidence_estimates[group] = res + + def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False): + """ + Save confidence estimates to delimited text files. + + Parameters + ---------- + dest_dir : str or None, optional + The directory in which to save the files. `None` will use the + current working directory. + file_root : str or None, optional + An optional prefix for the confidence estimate files. The + suffix will always be `mokapot.psms.txt` and + `mokapot.peptides.txt`. 
+ sep : str, optional + The delimiter to use. + decoys : bool, optional + Save decoys confidence estimates as well? + + Returns + ------- + list of str + The paths to the saved files. + """ + for group, res in self.group_confidence_estimates.items(): + prefix = file_root + f".{group}" + res.to_txt( + dest_dir=dest_dir, file_root=prefix, sep=sep, decoys=decoys + ) + + class Confidence: """ Estimate and store the statistical confidence for a collection of @@ -90,9 +166,9 @@ def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False): An optional prefix for the confidence estimate files. The suffix will always be `mokapot.psms.txt` and `mokapot.peptides.txt`. - sep : str + sep : str, optional The delimiter to use. - decoys : bool + decoys : bool, optional Save decoys confidence estimates as well? Returns @@ -283,7 +359,7 @@ def _assign_confidence(self, desc=True): proteins = picked_protein( peptides, self._target_column, - self._peptide_column[0], + self._peptide_column, self._score_column, self._proteins, ) diff --git a/mokapot/dataset.py b/mokapot/dataset.py index be3f78f1..0ee17e2d 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -2,10 +2,7 @@ The :py:class:`LinearPsmDataset` and :py:class:`CrossLinkedPsmDataset` classes are used to define collections peptide-spectrum matches. The :py:class:`LinearPsmDataset` class is suitable for most types of -data-dependent acquisition proteomics experiments, whereas the -:py:class:`CrossLinkedPsmDataset` is specifically designed for -collections of cross-linked PSMs (CSMs) originating from -cross-linking proteomics experiments. +data-dependent acquisition proteomics experiments. Although either class can be constructed from a :py:class:`pandas.DataFrame`, it is often easier to load the PSMs directly @@ -27,8 +24,12 @@ from . import qvalues from . import utils from .proteins import read_fasta -from .confidence import LinearConfidence, CrossLinkedConfidence from .proteins import FastaProteins +from .confidence import ( + LinearConfidence, + CrossLinkedConfidence, + GroupedConfidence, +) LOGGER = logging.getLogger(__name__) @@ -90,7 +91,13 @@ def _update_labels(self, scores, eval_fdr, desc): return def __init__( - self, psms, spectrum_columns, feature_columns, other_columns, copy_data + self, + psms, + spectrum_columns, + feature_columns, + group_column, + other_columns, + copy_data, ): """Initialize an object""" self._data = psms.copy(deep=copy_data).reset_index(drop=True) @@ -98,14 +105,22 @@ def __init__( # Set columns self._spectrum_columns = utils.tuplize(spectrum_columns) + self._group_column = group_column if other_columns is not None: other_columns = utils.tuplize(other_columns) else: other_columns = () + if group_column is not None: + group_column = (group_column,) + else: + group_column = () + # Check that all of the columns exist: - used_columns = sum([other_columns, self._spectrum_columns], tuple()) + used_columns = sum( + [other_columns, self._spectrum_columns, group_column], tuple(), + ) missing_columns = [c not in self.data.columns for c in used_columns] if not missing_columns: @@ -162,6 +177,13 @@ def spectra(self): """ return self.data.loc[:, self._spectrum_columns] + @property + def groups(self): + """ + A :py:class:`pandas.Series` of the groups for confidence estimation. 
+ """ + return self.data.loc[:, self._group_column] + @property def columns(self): """The columns of the dataset.""" @@ -336,7 +358,9 @@ class LinearPsmDataset(PsmDataset): protein_column : str, optional The column that specifies which protein(s) the detected peptide might have originated from. This column is not used to compute - protein-level confidence estimates (see :py:meth:`add_fasta()`). + protein-level confidence estimates (see :py:meth:`add_proteins()`). + group_column : str, optional + A factor by which to group PSMs for grouped confidence estimation. feature_columns : str or tuple of str, optional The column(s) specifying the feature(s) for mokapot analysis. If `None`, these are assumed to be all columns not specified in the @@ -353,7 +377,8 @@ class LinearPsmDataset(PsmDataset): metadata : pandas.DataFrame features : pandas.DataFrame spectra : pandas.DataFrame - peptides : pandas.DataFrame + peptides : pandas.Series + groups : pandas.Series targets : numpy.ndarray columns : list of str has_proteins : bool @@ -365,39 +390,26 @@ def __init__( target_column, spectrum_columns, peptide_column, - protein_column, + protein_column=None, + group_column=None, feature_columns=None, copy_data=True, ): """Initialize a PsmDataset object.""" self._target_column = target_column - self._peptide_column = utils.tuplize(peptide_column) - - if protein_column is not None: - self._protein_column = utils.tuplize(protein_column) - else: - self._protein_column = tuple() - - # Some error checking: - if len(self._peptide_column) > 1: - raise ValueError( - "Only one column can be used for " "'peptide_column'" - ) + self._peptide_column = peptide_column + self._protein_column = protein_column # Finish initialization - other_columns = sum( - [ - utils.tuplize(self._target_column), - self._peptide_column, - self._protein_column, - ], - tuple(), - ) + other_columns = [target_column, peptide_column] + if protein_column is not None: + other_columns += [protein_column] super().__init__( psms=psms, spectrum_columns=spectrum_columns, feature_columns=feature_columns, + group_column=group_column, other_columns=other_columns, copy_data=copy_data, ) @@ -438,7 +450,7 @@ def targets(self): @property def peptides(self): - """A :py:class:`pandas.DataFrame` of the peptide column.""" + """A :py:class:`pandas.Series` of the peptide column.""" return self.data.loc[:, self._peptide_column] def _update_labels(self, scores, eval_fdr=0.01, desc=True): @@ -509,7 +521,12 @@ def assign_confidence(self, scores=None, desc=True, eval_fdr=0.01): LOGGER.info("Selected %s as the best feature.", feat) scores = self.features[feat].values - return LinearConfidence(self, scores, eval_fdr=eval_fdr, desc=desc) + if self._group_column is None: + return LinearConfidence(self, scores, eval_fdr=eval_fdr, desc=desc) + else: + return GroupedConfidence( + self, scores, eval_fdr=eval_fdr, desc=desc + ) class CrossLinkedPsmDataset(PsmDataset): diff --git a/mokapot/parsers.py b/mokapot/parsers.py index 542c3052..844d8a6d 100644 --- a/mokapot/parsers.py +++ b/mokapot/parsers.py @@ -14,7 +14,7 @@ LOGGER = logging.getLogger(__name__) # Functions ------------------------------------------------------------------- -def read_pin(pin_files, to_df=False): +def read_pin(pin_files, group_column=None, to_df=False): """ Read Percolator input (PIN) tab-delimited files. @@ -39,7 +39,10 @@ def read_pin(pin_files, to_df=False): ---------- pin_files : str or tuple of str One or more PIN files to read. 
- to_df : bool + group_column : str, optional + A factor to by which to group PSMs for grouped confidence + estimation. + to_df : bool, optional Return a :py:class:`pandas.DataFrame` instead of a py:class:`~mokapot.dataset.LinearPsmDataset`. @@ -53,27 +56,20 @@ def read_pin(pin_files, to_df=False): pin_df = pd.concat([read_percolator(f) for f in utils.tuplize(pin_files)]) # Find all of the necessary columns, case-insensitive: - specid = tuple(c for c in pin_df.columns if c.lower() == "specid") - peptides = tuple(c for c in pin_df.columns if c.lower() == "peptide") - proteins = tuple(c for c in pin_df.columns if c.lower() == "proteins") - labels = tuple(c for c in pin_df.columns if c.lower() == "label") - other = tuple(c for c in pin_df.columns if c.lower() == "calcmass") - spectra = tuple( - c for c in pin_df.columns if c.lower() in ["scannr", "expmass"] - ) - - nonfeat = sum( - [specid, spectra, peptides, proteins, labels, other], tuple() - ) - - features = tuple(c for c in pin_df.columns if c not in nonfeat) + specid = [c for c in pin_df.columns if c.lower() == "specid"] + peptides = [c for c in pin_df.columns if c.lower() == "peptide"] + proteins = [c for c in pin_df.columns if c.lower() == "proteins"] + labels = [c for c in pin_df.columns if c.lower() == "label"] + other = [c for c in pin_df.columns if c.lower() == "calcmass"] + spectra = [c for c in pin_df.columns if c.lower() in ["scannr", "expmass"]] + nonfeat = sum([specid, spectra, peptides, proteins, labels, other], []) + features = [c for c in pin_df.columns if c not in nonfeat] # Check for errors: - if len(labels) > 1: - raise ValueError("More than one label column found in pin file.") - - if len(proteins) > 1: - raise ValueError("More than one protein column found in pin file.") + col_names = ["Label", "Peptide", "Proteins"] + for col, name in zip([labels, peptides, proteins], col_names): + if len(col) > 1: + raise ValueError(f"More than one '{name}' column found.") if not all([specid, peptides, proteins, labels, spectra]): raise ValueError( @@ -91,7 +87,7 @@ def read_pin(pin_files, to_df=False): psms=pin_df, target_column=labels[0], spectrum_columns=spectra, - peptide_column=peptides, + peptide_column=peptides[0], protein_column=proteins[0], feature_columns=features, copy_data=False, diff --git a/mokapot/utils.py b/mokapot/utils.py index 00068c9a..6d1ae6b6 100644 --- a/mokapot/utils.py +++ b/mokapot/utils.py @@ -9,6 +9,7 @@ def groupby_max(df, by_cols, max_col): """Quickly get the indices for the maximum value of col""" + by_cols = tuplize(by_cols) idx = ( df.sample(frac=1) .sort_values(list(by_cols) + [max_col], axis=0) @@ -67,7 +68,5 @@ def tuplize(obj): else: if isinstance(obj, str): obj = (obj,) - else: - tuple(obj) - return obj + return tuple(obj) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 59c7617b..7967f83d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -45,7 +45,7 @@ def test_linear_init(): metadata = ["target", "spectrum", "peptide", "protein"] pd.testing.assert_frame_equal(dset.spectra, DAT1.loc[:, ["spectrum"]]) - pd.testing.assert_frame_equal(dset.peptides, DAT1.loc[:, ["peptide"]]) + pd.testing.assert_series_equal(dset.peptides, DAT1.loc[:, "peptide"]) pd.testing.assert_frame_equal(dset.features, DAT1.loc[:, features]) pd.testing.assert_frame_equal(dset.metadata, DAT1.loc[:, metadata]) assert dset.columns == DAT1.columns.tolist() @@ -66,6 +66,7 @@ def test_update_labels(): spectrum_columns="spectrum", peptide_column="peptide", protein_column="protein", + group_column=None, 
feature_columns=None, copy_data=True, ) From b8dce007be34e61f404fb54ad6b17f352e9c8404 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sun, 1 Nov 2020 13:46:58 -0800 Subject: [PATCH 02/14] Fixed linting --- mokapot/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mokapot/dataset.py b/mokapot/dataset.py index 0ee17e2d..63dde838 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -119,7 +119,7 @@ def __init__( # Check that all of the columns exist: used_columns = sum( - [other_columns, self._spectrum_columns, group_column], tuple(), + [other_columns, self._spectrum_columns, group_column], tuple() ) missing_columns = [c not in self.data.columns for c in used_columns] From 29f819c1f48a71a51664bc51085c334bbc2ddb7d Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sun, 1 Nov 2020 14:32:25 -0800 Subject: [PATCH 03/14] Made grouped confidence estimates easier to use. --- mokapot/confidence.py | 29 ++++++++++++++++++++++++++++- mokapot/parsers.py | 40 ++++++++++++++++++++++++++++++---------- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 5b277f1b..e33bae98 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -48,6 +48,7 @@ class GroupedConfidence: def __init__(self, psms, scores, desc=True, eval_fdr=0.01): """Initialize a GroupedConfidence object""" group_psms = copy.copy(psms) + self.group_column = group_psms._group_column group_psms._group_column = None scores = scores * (desc * 2 - 1) @@ -98,11 +99,37 @@ def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False): list of str The paths to the saved files. """ + ret_files = [] for group, res in self.group_confidence_estimates.items(): prefix = file_root + f".{group}" - res.to_txt( + new_files = res.to_txt( dest_dir=dest_dir, file_root=prefix, sep=sep, decoys=decoys ) + ret_files.append(new_files) + + return ret_files + + def __repr__(self): + """Nice printing.""" + ngroups = len(self.group_confidence_estimates) + lines = [ + "A mokapot.confidence.GroupedConfidence object with " + f"{ngroups} groups:\n" + ] + + for group, conf in self.group_confidence_estimates.items(): + lines += [f"Group: {self.group_column} == {group}"] + lines += ["-" * len(lines[-1])] + lines += [str(conf)] + + return "\n".join(lines) + + def __getattr__(self, attr): + """Make groups accessible easily""" + try: + return self.grouped_confidence_estimates[attr] + except KeyError: + raise AttributeError class Confidence: diff --git a/mokapot/parsers.py b/mokapot/parsers.py index 844d8a6d..bf2da534 100644 --- a/mokapot/parsers.py +++ b/mokapot/parsers.py @@ -14,7 +14,7 @@ LOGGER = logging.getLogger(__name__) # Functions ------------------------------------------------------------------- -def read_pin(pin_files, group_column=None, to_df=False): +def read_pin(pin_files, group_column=None, to_df=False, copy_data=False): """ Read Percolator input (PIN) tab-delimited files. @@ -27,24 +27,33 @@ def read_pin(pin_files, group_column=None, to_df=False): Specifically, mokapot requires specific columns in the tab-delmited files: `specid`, `scannr`, `peptide`, `proteins`, and - `label`. Note that these column names are insensitive. In addition - to the required columns, mokapot will look for an `expmass` column, - which is generated by `Crux `_, but is not - intended to be a feature. + `label`. Note that these column names are case insensitive. 
In + addition to the required columns, mokapot will look for an `expmass` + and `calcmass` columns, which are generated by + `Crux `_, but are not intended to be features. - Additionally, mokapot does not currently support specifying a + In addition to PIN tab-delimited files, the `pin_files` argument + can a :py:class:`pandas.DataFrame` containing the above columns. + + Finally, mokapot does not currently support specifying a default direction or feature weights in the PIN file itself. Parameters ---------- - pin_files : str or tuple of str - One or more PIN files to read. + pin_files : str, tuple of str, or pandas.DataFrame + One or more PIN files to read or a :py:class:`pandas.DataFrame`. group_column : str, optional A factor to by which to group PSMs for grouped confidence estimation. to_df : bool, optional Return a :py:class:`pandas.DataFrame` instead of a py:class:`~mokapot.dataset.LinearPsmDataset`. + copy_data : bool, optional + If true, a deep copy of the data is created. This + uses more memory, but is safer because it prevents + accidental modification of the underlying data. This + argument only has an effect when `pin_files` is a + :py:class:`pandas.DataFrame` Returns ------- @@ -53,7 +62,13 @@ def read_pin(pin_files, group_column=None, to_df=False): containing the PSMs from all of the PIN files. """ logging.info("Parsing PSMs...") - pin_df = pd.concat([read_percolator(f) for f in utils.tuplize(pin_files)]) + + if isinstance(pin_files, pd.DataFrame): + pin_df = pin_files.copy(deep=copy_data) + else: + pin_df = pd.concat( + [read_percolator(f) for f in utils.tuplize(pin_files)] + ) # Find all of the necessary columns, case-insensitive: specid = [c for c in pin_df.columns if c.lower() == "specid"] @@ -63,6 +78,9 @@ def read_pin(pin_files, group_column=None, to_df=False): other = [c for c in pin_df.columns if c.lower() == "calcmass"] spectra = [c for c in pin_df.columns if c.lower() in ["scannr", "expmass"]] nonfeat = sum([specid, spectra, peptides, proteins, labels, other], []) + if group_column is not None: + nonfeat += [group_column] + features = [c for c in pin_df.columns if c not in nonfeat] # Check for errors: @@ -78,7 +96,8 @@ def read_pin(pin_files, group_column=None, to_df=False): ) # Convert labels to the correct format. - pin_df[labels[0]] = (pin_df[labels[0]] + 1) / 2 + if any(pin_df[labels[0]] == -1): + pin_df[labels[0]] = (pin_df[labels[0]] + 1) / 2 if to_df: return pin_df @@ -89,6 +108,7 @@ def read_pin(pin_files, group_column=None, to_df=False): spectrum_columns=spectra, peptide_column=peptides[0], protein_column=proteins[0], + group_column=group_column, feature_columns=features, copy_data=False, ) From ac4352174c4934c1609c882bbd7cd9ae34415107 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Mon, 2 Nov 2020 22:47:29 -0800 Subject: [PATCH 04/14] Confidence estimation warns, but tolerates no decoys remaining. --- mokapot/confidence.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index e33bae98..110f2f06 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -400,6 +400,11 @@ def _assign_confidence(self, desc=True): ).reset_index(drop=True) scores = data.loc[:, self._score_column].values targets = data.loc[:, self._target_column].astype(bool).values + if all(targets): + logging.warning( + "No decoy PSMs remain for confidence estimation. " + "Confidence estimates may be unreliable." 
+ ) # Estimate q-values and assign to dataframe LOGGER.info("Assiging q-values to %s...", level) @@ -423,9 +428,16 @@ def _assign_confidence(self, desc=True): # Calculate PEPs LOGGER.info("Assiging PEPs to %s...", level) - _, pep = qvality.getQvaluesFromScores( - scores[targets], scores[~targets], includeDecoys=True - ) + try: + _, pep = qvality.getQvaluesFromScores( + scores[targets], scores[~targets], includeDecoys=True + ) + except SystemExit as msg: + print(msg) + if "no decoy hits available for PEP calculation" in str(msg): + pep = 0 + else: + raise level = level.lower() data["mokapot PEP"] = pep From b9c554e1bf46d64d3befc918f2eeefd7f6b1ad98 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Wed, 4 Nov 2020 12:54:42 -0800 Subject: [PATCH 05/14] Modified logging to tell what group it's working on. --- mokapot/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mokapot/dataset.py b/mokapot/dataset.py index 63dde838..f6070c40 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -516,6 +516,7 @@ def assign_confidence(self, scores=None, desc=True, eval_fdr=0.01): A :py:class:`LinearConfidence` object storing the confidence estimates for the collection of PSMs. """ + LOGGER.info("=== Assigning Confidence ===") if scores is None: feat, _, _, desc = self._find_best_feature(eval_fdr) LOGGER.info("Selected %s as the best feature.", feat) From 8239e00a1c18bb0b9a1ff6fcb6e7048115397cdd Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Wed, 4 Nov 2020 12:57:21 -0800 Subject: [PATCH 06/14] confidence.py --- mokapot/confidence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 110f2f06..adb9df75 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -67,6 +67,7 @@ def __init__(self, psms, scores, desc=True, eval_fdr=0.01): self.group_confidence_estimates = {} for group, group_df in psms._data.groupby(psms._group_column): + LOGGER.info("Group: %s == %s", self.group_column, group) group_psms._data = None tdc_winners = group_df.index.intersection(idx) group_psms._data = group_df.loc[tdc_winners, :] @@ -315,7 +316,6 @@ class LinearConfidence(Confidence): def __init__(self, psms, scores, desc=True, eval_fdr=0.01): """Initialize a a LinearPsmConfidence object""" - LOGGER.info("=== Assigning Confidence ===") super().__init__(psms, scores, desc) self._target_column = _new_column("target", self._data) self._data[self._target_column] = psms.targets From be72861f745b30089157597b8ca19c14b127f592 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Wed, 4 Nov 2020 13:02:08 -0800 Subject: [PATCH 07/14] Clarified logging. --- mokapot/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mokapot/dataset.py b/mokapot/dataset.py index f6070c40..59be3b3c 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -516,15 +516,16 @@ def assign_confidence(self, scores=None, desc=True, eval_fdr=0.01): A :py:class:`LinearConfidence` object storing the confidence estimates for the collection of PSMs. 
""" - LOGGER.info("=== Assigning Confidence ===") if scores is None: feat, _, _, desc = self._find_best_feature(eval_fdr) LOGGER.info("Selected %s as the best feature.", feat) scores = self.features[feat].values if self._group_column is None: + LOGGER.info("Assigning confidence...") return LinearConfidence(self, scores, eval_fdr=eval_fdr, desc=desc) else: + LOGGER.info("Assigning confidence within groups...") return GroupedConfidence( self, scores, eval_fdr=eval_fdr, desc=desc ) From cd516ddda446c32cf090a86713c6e2380fc52857 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Wed, 4 Nov 2020 14:17:14 -0800 Subject: [PATCH 08/14] Fixed bug for ascending scores in groups --- mokapot/confidence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index adb9df75..7be8daaa 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -73,7 +73,7 @@ def __init__(self, psms, scores, desc=True, eval_fdr=0.01): group_psms._data = group_df.loc[tdc_winners, :] group_scores = scores.loc[group_psms._data.index].values + 1 res = group_psms.assign_confidence( - (group_scores + 1) / (2 * desc), desc=desc, eval_fdr=eval_fdr + group_scores * (2 * desc - 1), desc=desc, eval_fdr=eval_fdr ) self.group_confidence_estimates[group] = res From 0ac4b571e68c4cabac6f9956021a82125c1bba3d Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Wed, 4 Nov 2020 16:51:40 -0800 Subject: [PATCH 09/14] Added selected hyperparaters to logging. --- mokapot/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mokapot/model.py b/mokapot/model.py index 93e0fbde..8fc20193 100644 --- a/mokapot/model.py +++ b/mokapot/model.py @@ -618,6 +618,8 @@ def _find_hyperparameters(model, features, labels): new_est = model.estimator.estimator new_est.set_params(**best_params) model._needs_cv = False + for param, value in best_params.items(): + LOGGER.info("\t- %s = %s", param, value) else: new_est = model.estimator From 011c9eda828944b39f48034d2045f83e4e335387 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sat, 7 Nov 2020 13:57:56 -0800 Subject: [PATCH 10/14] Updated logging. --- mokapot/confidence.py | 2 +- mokapot/picked_protein.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 7be8daaa..a3a084ad 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -401,7 +401,7 @@ def _assign_confidence(self, desc=True): scores = data.loc[:, self._score_column].values targets = data.loc[:, self._target_column].astype(bool).values if all(targets): - logging.warning( + LOGGER.warning( "No decoy PSMs remain for confidence estimation. " "Confidence estimates may be unreliable." ) diff --git a/mokapot/picked_protein.py b/mokapot/picked_protein.py index d11e0e2e..65f03544 100644 --- a/mokapot/picked_protein.py +++ b/mokapot/picked_protein.py @@ -101,6 +101,7 @@ def picked_protein( shared_unmatched, len(prots), ) + LOGGER.debug("%s", unmatched_prots.loc[~shared, "stripped Sequence"]) prots = prots.loc[~unmatched, :] prots["decoy"] = ( From 9162632ac6a0d19c307b00e4d4e86ea344820496 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sat, 7 Nov 2020 14:04:14 -0800 Subject: [PATCH 11/14] Fixed typo. 
--- mokapot/picked_protein.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mokapot/picked_protein.py b/mokapot/picked_protein.py index 65f03544..81aaae25 100644 --- a/mokapot/picked_protein.py +++ b/mokapot/picked_protein.py @@ -101,7 +101,7 @@ def picked_protein( shared_unmatched, len(prots), ) - LOGGER.debug("%s", unmatched_prots.loc[~shared, "stripped Sequence"]) + LOGGER.debug("%s", unmatched_prots.loc[~shared, "stripped sequence"]) prots = prots.loc[~unmatched, :] prots["decoy"] = ( From 195b0b6981ca849b7903ae30c6b1a0cbd6c09555 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sat, 7 Nov 2020 14:19:01 -0800 Subject: [PATCH 12/14] Fix for lowercase letters in peptide sequence. --- mokapot/picked_protein.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mokapot/picked_protein.py b/mokapot/picked_protein.py index 81aaae25..85e73b0c 100644 --- a/mokapot/picked_protein.py +++ b/mokapot/picked_protein.py @@ -48,6 +48,14 @@ def picked_protein( .str.replace(r"\..*?$", "") ) + # Sometimes folks use lowercase letters for the termini or mods: + if all(prots["stripped sequence"].str.islower()): + seqs = prots["stripped sequence"].upper() + else: + seqs = prots["stripped sequence"].str.replace(r"[a-z]", "") + + prots["stripped sequence"] = seqs + # There are two cases we need to deal with: # 1. The fasta contained both targets and decoys (ideal) # 2. The fasta contained only targets (less ideal) From dc11dc8a43f7b11de1da06126fee557fe137f36c Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Sat, 7 Nov 2020 14:28:09 -0800 Subject: [PATCH 13/14] Fixed error checking for decoys --- mokapot/picked_protein.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mokapot/picked_protein.py b/mokapot/picked_protein.py index 85e73b0c..ba0b81f5 100644 --- a/mokapot/picked_protein.py +++ b/mokapot/picked_protein.py @@ -73,16 +73,6 @@ def picked_protein( if not proteins.has_decoys: unmatched[~prots[target_column]] = False - # Verify that reasonable number of decoys were matched. - if proteins.has_decoys: - num_unmatched_decoys = unmatched[~prots[target_column]].sum() - total_decoys = (~prots[target_column]).sum() - if num_unmatched_decoys / total_decoys > 0.05: - raise ValueError( - "Fewer than 5% of decoy peptides could be mapped to proteins." - " Was the correct FASTA file and digest settings used?" - ) - unmatched_prots = prots.loc[unmatched, :] shared = unmatched_prots["stripped sequence"].isin( proteins.shared_peptides @@ -97,6 +87,7 @@ def picked_protein( ) if shared_unmatched: + LOGGER.debug("%s", unmatched_prots.loc[~shared, "stripped sequence"]) if shared_unmatched / len(prots) > 0.10: raise ValueError( "Fewer than 90% of all peptides could be matched to proteins. " @@ -109,7 +100,16 @@ def picked_protein( shared_unmatched, len(prots), ) - LOGGER.debug("%s", unmatched_prots.loc[~shared, "stripped sequence"]) + + # Verify that reasonable number of decoys were matched. + if proteins.has_decoys: + num_unmatched_decoys = unmatched_prots[target_column][~shared].sum() + total_decoys = (~prots[target_column]).sum() + if num_unmatched_decoys / total_decoys > 0.05: + raise ValueError( + "Fewer than 5% of decoy peptides could be mapped to proteins." + " Was the correct FASTA file and digest settings used?" 
+ ) prots = prots.loc[~unmatched, :] prots["decoy"] = ( From 21f6eb55fdb9f344fcf3a25ff14d5cb2458bb2e0 Mon Sep 17 00:00:00 2001 From: William Fondrie Date: Mon, 9 Nov 2020 01:20:58 -0800 Subject: [PATCH 14/14] Don't calibrate scores if using probabilities. --- mokapot/brew.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mokapot/brew.py b/mokapot/brew.py index 296b0805..e1cbee29 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -208,9 +208,15 @@ def _predict(dset, test_idx, models, test_fdr): scores = [] for fold_idx, mod in zip(test_idx, models): test_set._data = dset.data.loc[list(fold_idx), :] - scores.append( - test_set._calibrate_scores(mod.predict(test_set), test_fdr) - ) + + # Don't calibrate if using predict_proba. + try: + mod.estimator.decision_function + scores.append( + test_set._calibrate_scores(mod.predict(test_set), test_fdr) + ) + except AttributeError: + scores.append(mod.predict(test_set)) rev_idx = np.argsort(sum(test_idx, [])).tolist() return np.concatenate(scores)[rev_idx]