diff --git a/mokapot/brew.py b/mokapot/brew.py index 296b0805..e1cbee29 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -208,9 +208,15 @@ def _predict(dset, test_idx, models, test_fdr): scores = [] for fold_idx, mod in zip(test_idx, models): test_set._data = dset.data.loc[list(fold_idx), :] - scores.append( - test_set._calibrate_scores(mod.predict(test_set), test_fdr) - ) + + # Don't calibrate if using predict_proba. + try: + mod.estimator.decision_function + scores.append( + test_set._calibrate_scores(mod.predict(test_set), test_fdr) + ) + except AttributeError: + scores.append(mod.predict(test_set)) rev_idx = np.argsort(sum(test_idx, [])).tolist() return np.concatenate(scores)[rev_idx] diff --git a/mokapot/confidence.py b/mokapot/confidence.py index 3760f35b..a3a084ad 100644 --- a/mokapot/confidence.py +++ b/mokapot/confidence.py @@ -1,8 +1,7 @@ """ One of the primary purposes of mokapot is to assign confidence estimates to PSMs. This task is accomplished by ranking PSMs according to a score or metric and -using an appropriate confidence estimation procedure for the type of data -(currently, linear and cross-linked PSMs are supported). In either case, +using an appropriate confidence estimation procedure for the type of data. mokapot can provide confidence estimates based any score, regardless of whether it was the result of a learned :py:func:`mokapot.model.Model` instance or provided independently. @@ -11,10 +10,10 @@ provided score. In either case, they provide utilities to access, save, and plot these estimates for the various relevant levels (i.e. PSMs, peptides, and proteins). The :py:func:`LinearConfidence` class is appropriate for most -proteomics datasets, whereas the :py:func:`CrossLinkedConfidence` is -specifically designed for crosslinked peptides. +proteomics datasets. """ import os +import copy import logging import pandas as pd @@ -29,6 +28,111 @@ # Classes --------------------------------------------------------------------- +class GroupedConfidence: + """ + Performed grouped confidence estimation for a collection of PSMs. + + Parameters + ---------- + psms : LinearPsmDataset object + A collection of PSMs. + scores : np.ndarray + A vector containing the score of each PSM. + desc : bool + Are higher scores better? + eval_fdr : float + The FDR threshold at which to report performance. This parameter + has no affect on the analysis itself, only logging messages. + """ + + def __init__(self, psms, scores, desc=True, eval_fdr=0.01): + """Initialize a GroupedConfidence object""" + group_psms = copy.copy(psms) + self.group_column = group_psms._group_column + group_psms._group_column = None + scores = scores * (desc * 2 - 1) + + # Do TDC + scores = ( + pd.Series(scores, index=psms._data.index) + .sample(frac=1) + .sort_values() + ) + + idx = ( + psms.data.loc[scores.index, :] + .drop_duplicates(psms._spectrum_columns, keep="last") + .index + ) + + self.group_confidence_estimates = {} + for group, group_df in psms._data.groupby(psms._group_column): + LOGGER.info("Group: %s == %s", self.group_column, group) + group_psms._data = None + tdc_winners = group_df.index.intersection(idx) + group_psms._data = group_df.loc[tdc_winners, :] + group_scores = scores.loc[group_psms._data.index].values + 1 + res = group_psms.assign_confidence( + group_scores * (2 * desc - 1), desc=desc, eval_fdr=eval_fdr + ) + self.group_confidence_estimates[group] = res + + def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False): + """ + Save confidence estimates to delimited text files. + + Parameters + ---------- + dest_dir : str or None, optional + The directory in which to save the files. `None` will use the + current working directory. + file_root : str or None, optional + An optional prefix for the confidence estimate files. The + suffix will always be `mokapot.psms.txt` and + `mokapot.peptides.txt`. + sep : str, optional + The delimiter to use. + decoys : bool, optional + Save decoys confidence estimates as well? + + Returns + ------- + list of str + The paths to the saved files. + """ + ret_files = [] + for group, res in self.group_confidence_estimates.items(): + prefix = file_root + f".{group}" + new_files = res.to_txt( + dest_dir=dest_dir, file_root=prefix, sep=sep, decoys=decoys + ) + ret_files.append(new_files) + + return ret_files + + def __repr__(self): + """Nice printing.""" + ngroups = len(self.group_confidence_estimates) + lines = [ + "A mokapot.confidence.GroupedConfidence object with " + f"{ngroups} groups:\n" + ] + + for group, conf in self.group_confidence_estimates.items(): + lines += [f"Group: {self.group_column} == {group}"] + lines += ["-" * len(lines[-1])] + lines += [str(conf)] + + return "\n".join(lines) + + def __getattr__(self, attr): + """Make groups accessible easily""" + try: + return self.grouped_confidence_estimates[attr] + except KeyError: + raise AttributeError + + class Confidence: """ Estimate and store the statistical confidence for a collection of @@ -90,9 +194,9 @@ def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False): An optional prefix for the confidence estimate files. The suffix will always be `mokapot.psms.txt` and `mokapot.peptides.txt`. - sep : str + sep : str, optional The delimiter to use. - decoys : bool + decoys : bool, optional Save decoys confidence estimates as well? Returns @@ -212,7 +316,6 @@ class LinearConfidence(Confidence): def __init__(self, psms, scores, desc=True, eval_fdr=0.01): """Initialize a a LinearPsmConfidence object""" - LOGGER.info("=== Assigning Confidence ===") super().__init__(psms, scores, desc) self._target_column = _new_column("target", self._data) self._data[self._target_column] = psms.targets @@ -283,7 +386,7 @@ def _assign_confidence(self, desc=True): proteins = picked_protein( peptides, self._target_column, - self._peptide_column[0], + self._peptide_column, self._score_column, self._proteins, ) @@ -297,6 +400,11 @@ def _assign_confidence(self, desc=True): ).reset_index(drop=True) scores = data.loc[:, self._score_column].values targets = data.loc[:, self._target_column].astype(bool).values + if all(targets): + LOGGER.warning( + "No decoy PSMs remain for confidence estimation. " + "Confidence estimates may be unreliable." + ) # Estimate q-values and assign to dataframe LOGGER.info("Assiging q-values to %s...", level) @@ -320,9 +428,16 @@ def _assign_confidence(self, desc=True): # Calculate PEPs LOGGER.info("Assiging PEPs to %s...", level) - _, pep = qvality.getQvaluesFromScores( - scores[targets], scores[~targets], includeDecoys=True - ) + try: + _, pep = qvality.getQvaluesFromScores( + scores[targets], scores[~targets], includeDecoys=True + ) + except SystemExit as msg: + print(msg) + if "no decoy hits available for PEP calculation" in str(msg): + pep = 0 + else: + raise level = level.lower() data["mokapot PEP"] = pep diff --git a/mokapot/dataset.py b/mokapot/dataset.py index be3f78f1..59be3b3c 100644 --- a/mokapot/dataset.py +++ b/mokapot/dataset.py @@ -2,10 +2,7 @@ The :py:class:`LinearPsmDataset` and :py:class:`CrossLinkedPsmDataset` classes are used to define collections peptide-spectrum matches. The :py:class:`LinearPsmDataset` class is suitable for most types of -data-dependent acquisition proteomics experiments, whereas the -:py:class:`CrossLinkedPsmDataset` is specifically designed for -collections of cross-linked PSMs (CSMs) originating from -cross-linking proteomics experiments. +data-dependent acquisition proteomics experiments. Although either class can be constructed from a :py:class:`pandas.DataFrame`, it is often easier to load the PSMs directly @@ -27,8 +24,12 @@ from . import qvalues from . import utils from .proteins import read_fasta -from .confidence import LinearConfidence, CrossLinkedConfidence from .proteins import FastaProteins +from .confidence import ( + LinearConfidence, + CrossLinkedConfidence, + GroupedConfidence, +) LOGGER = logging.getLogger(__name__) @@ -90,7 +91,13 @@ def _update_labels(self, scores, eval_fdr, desc): return def __init__( - self, psms, spectrum_columns, feature_columns, other_columns, copy_data + self, + psms, + spectrum_columns, + feature_columns, + group_column, + other_columns, + copy_data, ): """Initialize an object""" self._data = psms.copy(deep=copy_data).reset_index(drop=True) @@ -98,14 +105,22 @@ def __init__( # Set columns self._spectrum_columns = utils.tuplize(spectrum_columns) + self._group_column = group_column if other_columns is not None: other_columns = utils.tuplize(other_columns) else: other_columns = () + if group_column is not None: + group_column = (group_column,) + else: + group_column = () + # Check that all of the columns exist: - used_columns = sum([other_columns, self._spectrum_columns], tuple()) + used_columns = sum( + [other_columns, self._spectrum_columns, group_column], tuple() + ) missing_columns = [c not in self.data.columns for c in used_columns] if not missing_columns: @@ -162,6 +177,13 @@ def spectra(self): """ return self.data.loc[:, self._spectrum_columns] + @property + def groups(self): + """ + A :py:class:`pandas.Series` of the groups for confidence estimation. + """ + return self.data.loc[:, self._group_column] + @property def columns(self): """The columns of the dataset.""" @@ -336,7 +358,9 @@ class LinearPsmDataset(PsmDataset): protein_column : str, optional The column that specifies which protein(s) the detected peptide might have originated from. This column is not used to compute - protein-level confidence estimates (see :py:meth:`add_fasta()`). + protein-level confidence estimates (see :py:meth:`add_proteins()`). + group_column : str, optional + A factor by which to group PSMs for grouped confidence estimation. feature_columns : str or tuple of str, optional The column(s) specifying the feature(s) for mokapot analysis. If `None`, these are assumed to be all columns not specified in the @@ -353,7 +377,8 @@ class LinearPsmDataset(PsmDataset): metadata : pandas.DataFrame features : pandas.DataFrame spectra : pandas.DataFrame - peptides : pandas.DataFrame + peptides : pandas.Series + groups : pandas.Series targets : numpy.ndarray columns : list of str has_proteins : bool @@ -365,39 +390,26 @@ def __init__( target_column, spectrum_columns, peptide_column, - protein_column, + protein_column=None, + group_column=None, feature_columns=None, copy_data=True, ): """Initialize a PsmDataset object.""" self._target_column = target_column - self._peptide_column = utils.tuplize(peptide_column) - - if protein_column is not None: - self._protein_column = utils.tuplize(protein_column) - else: - self._protein_column = tuple() - - # Some error checking: - if len(self._peptide_column) > 1: - raise ValueError( - "Only one column can be used for " "'peptide_column'" - ) + self._peptide_column = peptide_column + self._protein_column = protein_column # Finish initialization - other_columns = sum( - [ - utils.tuplize(self._target_column), - self._peptide_column, - self._protein_column, - ], - tuple(), - ) + other_columns = [target_column, peptide_column] + if protein_column is not None: + other_columns += [protein_column] super().__init__( psms=psms, spectrum_columns=spectrum_columns, feature_columns=feature_columns, + group_column=group_column, other_columns=other_columns, copy_data=copy_data, ) @@ -438,7 +450,7 @@ def targets(self): @property def peptides(self): - """A :py:class:`pandas.DataFrame` of the peptide column.""" + """A :py:class:`pandas.Series` of the peptide column.""" return self.data.loc[:, self._peptide_column] def _update_labels(self, scores, eval_fdr=0.01, desc=True): @@ -509,7 +521,14 @@ def assign_confidence(self, scores=None, desc=True, eval_fdr=0.01): LOGGER.info("Selected %s as the best feature.", feat) scores = self.features[feat].values - return LinearConfidence(self, scores, eval_fdr=eval_fdr, desc=desc) + if self._group_column is None: + LOGGER.info("Assigning confidence...") + return LinearConfidence(self, scores, eval_fdr=eval_fdr, desc=desc) + else: + LOGGER.info("Assigning confidence within groups...") + return GroupedConfidence( + self, scores, eval_fdr=eval_fdr, desc=desc + ) class CrossLinkedPsmDataset(PsmDataset): diff --git a/mokapot/model.py b/mokapot/model.py index 93e0fbde..8fc20193 100644 --- a/mokapot/model.py +++ b/mokapot/model.py @@ -618,6 +618,8 @@ def _find_hyperparameters(model, features, labels): new_est = model.estimator.estimator new_est.set_params(**best_params) model._needs_cv = False + for param, value in best_params.items(): + LOGGER.info("\t- %s = %s", param, value) else: new_est = model.estimator diff --git a/mokapot/parsers.py b/mokapot/parsers.py index 542c3052..bf2da534 100644 --- a/mokapot/parsers.py +++ b/mokapot/parsers.py @@ -14,7 +14,7 @@ LOGGER = logging.getLogger(__name__) # Functions ------------------------------------------------------------------- -def read_pin(pin_files, to_df=False): +def read_pin(pin_files, group_column=None, to_df=False, copy_data=False): """ Read Percolator input (PIN) tab-delimited files. @@ -27,21 +27,33 @@ def read_pin(pin_files, to_df=False): Specifically, mokapot requires specific columns in the tab-delmited files: `specid`, `scannr`, `peptide`, `proteins`, and - `label`. Note that these column names are insensitive. In addition - to the required columns, mokapot will look for an `expmass` column, - which is generated by `Crux `_, but is not - intended to be a feature. + `label`. Note that these column names are case insensitive. In + addition to the required columns, mokapot will look for an `expmass` + and `calcmass` columns, which are generated by + `Crux `_, but are not intended to be features. - Additionally, mokapot does not currently support specifying a + In addition to PIN tab-delimited files, the `pin_files` argument + can a :py:class:`pandas.DataFrame` containing the above columns. + + Finally, mokapot does not currently support specifying a default direction or feature weights in the PIN file itself. Parameters ---------- - pin_files : str or tuple of str - One or more PIN files to read. - to_df : bool + pin_files : str, tuple of str, or pandas.DataFrame + One or more PIN files to read or a :py:class:`pandas.DataFrame`. + group_column : str, optional + A factor to by which to group PSMs for grouped confidence + estimation. + to_df : bool, optional Return a :py:class:`pandas.DataFrame` instead of a py:class:`~mokapot.dataset.LinearPsmDataset`. + copy_data : bool, optional + If true, a deep copy of the data is created. This + uses more memory, but is safer because it prevents + accidental modification of the underlying data. This + argument only has an effect when `pin_files` is a + :py:class:`pandas.DataFrame` Returns ------- @@ -50,30 +62,32 @@ def read_pin(pin_files, to_df=False): containing the PSMs from all of the PIN files. """ logging.info("Parsing PSMs...") - pin_df = pd.concat([read_percolator(f) for f in utils.tuplize(pin_files)]) - # Find all of the necessary columns, case-insensitive: - specid = tuple(c for c in pin_df.columns if c.lower() == "specid") - peptides = tuple(c for c in pin_df.columns if c.lower() == "peptide") - proteins = tuple(c for c in pin_df.columns if c.lower() == "proteins") - labels = tuple(c for c in pin_df.columns if c.lower() == "label") - other = tuple(c for c in pin_df.columns if c.lower() == "calcmass") - spectra = tuple( - c for c in pin_df.columns if c.lower() in ["scannr", "expmass"] - ) - - nonfeat = sum( - [specid, spectra, peptides, proteins, labels, other], tuple() - ) + if isinstance(pin_files, pd.DataFrame): + pin_df = pin_files.copy(deep=copy_data) + else: + pin_df = pd.concat( + [read_percolator(f) for f in utils.tuplize(pin_files)] + ) - features = tuple(c for c in pin_df.columns if c not in nonfeat) + # Find all of the necessary columns, case-insensitive: + specid = [c for c in pin_df.columns if c.lower() == "specid"] + peptides = [c for c in pin_df.columns if c.lower() == "peptide"] + proteins = [c for c in pin_df.columns if c.lower() == "proteins"] + labels = [c for c in pin_df.columns if c.lower() == "label"] + other = [c for c in pin_df.columns if c.lower() == "calcmass"] + spectra = [c for c in pin_df.columns if c.lower() in ["scannr", "expmass"]] + nonfeat = sum([specid, spectra, peptides, proteins, labels, other], []) + if group_column is not None: + nonfeat += [group_column] + + features = [c for c in pin_df.columns if c not in nonfeat] # Check for errors: - if len(labels) > 1: - raise ValueError("More than one label column found in pin file.") - - if len(proteins) > 1: - raise ValueError("More than one protein column found in pin file.") + col_names = ["Label", "Peptide", "Proteins"] + for col, name in zip([labels, peptides, proteins], col_names): + if len(col) > 1: + raise ValueError(f"More than one '{name}' column found.") if not all([specid, peptides, proteins, labels, spectra]): raise ValueError( @@ -82,7 +96,8 @@ def read_pin(pin_files, to_df=False): ) # Convert labels to the correct format. - pin_df[labels[0]] = (pin_df[labels[0]] + 1) / 2 + if any(pin_df[labels[0]] == -1): + pin_df[labels[0]] = (pin_df[labels[0]] + 1) / 2 if to_df: return pin_df @@ -91,8 +106,9 @@ def read_pin(pin_files, to_df=False): psms=pin_df, target_column=labels[0], spectrum_columns=spectra, - peptide_column=peptides, + peptide_column=peptides[0], protein_column=proteins[0], + group_column=group_column, feature_columns=features, copy_data=False, ) diff --git a/mokapot/picked_protein.py b/mokapot/picked_protein.py index d11e0e2e..ba0b81f5 100644 --- a/mokapot/picked_protein.py +++ b/mokapot/picked_protein.py @@ -48,6 +48,14 @@ def picked_protein( .str.replace(r"\..*?$", "") ) + # Sometimes folks use lowercase letters for the termini or mods: + if all(prots["stripped sequence"].str.islower()): + seqs = prots["stripped sequence"].upper() + else: + seqs = prots["stripped sequence"].str.replace(r"[a-z]", "") + + prots["stripped sequence"] = seqs + # There are two cases we need to deal with: # 1. The fasta contained both targets and decoys (ideal) # 2. The fasta contained only targets (less ideal) @@ -65,16 +73,6 @@ def picked_protein( if not proteins.has_decoys: unmatched[~prots[target_column]] = False - # Verify that reasonable number of decoys were matched. - if proteins.has_decoys: - num_unmatched_decoys = unmatched[~prots[target_column]].sum() - total_decoys = (~prots[target_column]).sum() - if num_unmatched_decoys / total_decoys > 0.05: - raise ValueError( - "Fewer than 5% of decoy peptides could be mapped to proteins." - " Was the correct FASTA file and digest settings used?" - ) - unmatched_prots = prots.loc[unmatched, :] shared = unmatched_prots["stripped sequence"].isin( proteins.shared_peptides @@ -89,6 +87,7 @@ def picked_protein( ) if shared_unmatched: + LOGGER.debug("%s", unmatched_prots.loc[~shared, "stripped sequence"]) if shared_unmatched / len(prots) > 0.10: raise ValueError( "Fewer than 90% of all peptides could be matched to proteins. " @@ -102,6 +101,16 @@ def picked_protein( len(prots), ) + # Verify that reasonable number of decoys were matched. + if proteins.has_decoys: + num_unmatched_decoys = unmatched_prots[target_column][~shared].sum() + total_decoys = (~prots[target_column]).sum() + if num_unmatched_decoys / total_decoys > 0.05: + raise ValueError( + "Fewer than 5% of decoy peptides could be mapped to proteins." + " Was the correct FASTA file and digest settings used?" + ) + prots = prots.loc[~unmatched, :] prots["decoy"] = ( prots["mokapot protein group"] diff --git a/mokapot/utils.py b/mokapot/utils.py index 00068c9a..6d1ae6b6 100644 --- a/mokapot/utils.py +++ b/mokapot/utils.py @@ -9,6 +9,7 @@ def groupby_max(df, by_cols, max_col): """Quickly get the indices for the maximum value of col""" + by_cols = tuplize(by_cols) idx = ( df.sample(frac=1) .sort_values(list(by_cols) + [max_col], axis=0) @@ -67,7 +68,5 @@ def tuplize(obj): else: if isinstance(obj, str): obj = (obj,) - else: - tuple(obj) - return obj + return tuple(obj) diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 59c7617b..7967f83d 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -45,7 +45,7 @@ def test_linear_init(): metadata = ["target", "spectrum", "peptide", "protein"] pd.testing.assert_frame_equal(dset.spectra, DAT1.loc[:, ["spectrum"]]) - pd.testing.assert_frame_equal(dset.peptides, DAT1.loc[:, ["peptide"]]) + pd.testing.assert_series_equal(dset.peptides, DAT1.loc[:, "peptide"]) pd.testing.assert_frame_equal(dset.features, DAT1.loc[:, features]) pd.testing.assert_frame_equal(dset.metadata, DAT1.loc[:, metadata]) assert dset.columns == DAT1.columns.tolist() @@ -66,6 +66,7 @@ def test_update_labels(): spectrum_columns="spectrum", peptide_column="peptide", protein_column="protein", + group_column=None, feature_columns=None, copy_data=True, )