Skip to content

Commit

Permalink
Merge pull request #2 from wfondrie/grouping
Browse files Browse the repository at this point in the history
Grouping
  • Loading branch information
wfondrie authored Nov 10, 2020
2 parents 317c2cd + 21f6eb5 commit c0bc3a9
Show file tree
Hide file tree
Showing 8 changed files with 258 additions and 91 deletions.
12 changes: 9 additions & 3 deletions mokapot/brew.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,9 +208,15 @@ def _predict(dset, test_idx, models, test_fdr):
scores = []
for fold_idx, mod in zip(test_idx, models):
test_set._data = dset.data.loc[list(fold_idx), :]
scores.append(
test_set._calibrate_scores(mod.predict(test_set), test_fdr)
)

# Don't calibrate if using predict_proba.
try:
mod.estimator.decision_function
scores.append(
test_set._calibrate_scores(mod.predict(test_set), test_fdr)
)
except AttributeError:
scores.append(mod.predict(test_set))

rev_idx = np.argsort(sum(test_idx, [])).tolist()
return np.concatenate(scores)[rev_idx]
Expand Down
137 changes: 126 additions & 11 deletions mokapot/confidence.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
"""
One of the primary purposes of mokapot is to assign confidence estimates to PSMs.
This task is accomplished by ranking PSMs according to a score or metric and
using an appropriate confidence estimation procedure for the type of data
(currently, linear and cross-linked PSMs are supported). In either case,
using an appropriate confidence estimation procedure for the type of data.
mokapot can provide confidence estimates based any score, regardless of
whether it was the result of a learned :py:func:`mokapot.model.Model`
instance or provided independently.
Expand All @@ -11,10 +10,10 @@
provided score. In either case, they provide utilities to access, save, and
plot these estimates for the various relevant levels (i.e. PSMs, peptides, and
proteins). The :py:func:`LinearConfidence` class is appropriate for most
proteomics datasets, whereas the :py:func:`CrossLinkedConfidence` is
specifically designed for crosslinked peptides.
proteomics datasets.
"""
import os
import copy
import logging

import pandas as pd
Expand All @@ -29,6 +28,111 @@


# Classes ---------------------------------------------------------------------
class GroupedConfidence:
"""
Performed grouped confidence estimation for a collection of PSMs.
Parameters
----------
psms : LinearPsmDataset object
A collection of PSMs.
scores : np.ndarray
A vector containing the score of each PSM.
desc : bool
Are higher scores better?
eval_fdr : float
The FDR threshold at which to report performance. This parameter
has no affect on the analysis itself, only logging messages.
"""

def __init__(self, psms, scores, desc=True, eval_fdr=0.01):
"""Initialize a GroupedConfidence object"""
group_psms = copy.copy(psms)
self.group_column = group_psms._group_column
group_psms._group_column = None
scores = scores * (desc * 2 - 1)

# Do TDC
scores = (
pd.Series(scores, index=psms._data.index)
.sample(frac=1)
.sort_values()
)

idx = (
psms.data.loc[scores.index, :]
.drop_duplicates(psms._spectrum_columns, keep="last")
.index
)

self.group_confidence_estimates = {}
for group, group_df in psms._data.groupby(psms._group_column):
LOGGER.info("Group: %s == %s", self.group_column, group)
group_psms._data = None
tdc_winners = group_df.index.intersection(idx)
group_psms._data = group_df.loc[tdc_winners, :]
group_scores = scores.loc[group_psms._data.index].values + 1
res = group_psms.assign_confidence(
group_scores * (2 * desc - 1), desc=desc, eval_fdr=eval_fdr
)
self.group_confidence_estimates[group] = res

def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False):
"""
Save confidence estimates to delimited text files.
Parameters
----------
dest_dir : str or None, optional
The directory in which to save the files. `None` will use the
current working directory.
file_root : str or None, optional
An optional prefix for the confidence estimate files. The
suffix will always be `mokapot.psms.txt` and
`mokapot.peptides.txt`.
sep : str, optional
The delimiter to use.
decoys : bool, optional
Save decoys confidence estimates as well?
Returns
-------
list of str
The paths to the saved files.
"""
ret_files = []
for group, res in self.group_confidence_estimates.items():
prefix = file_root + f".{group}"
new_files = res.to_txt(
dest_dir=dest_dir, file_root=prefix, sep=sep, decoys=decoys
)
ret_files.append(new_files)

return ret_files

def __repr__(self):
"""Nice printing."""
ngroups = len(self.group_confidence_estimates)
lines = [
"A mokapot.confidence.GroupedConfidence object with "
f"{ngroups} groups:\n"
]

for group, conf in self.group_confidence_estimates.items():
lines += [f"Group: {self.group_column} == {group}"]
lines += ["-" * len(lines[-1])]
lines += [str(conf)]

return "\n".join(lines)

def __getattr__(self, attr):
"""Make groups accessible easily"""
try:
return self.grouped_confidence_estimates[attr]
except KeyError:
raise AttributeError


class Confidence:
"""
Estimate and store the statistical confidence for a collection of
Expand Down Expand Up @@ -90,9 +194,9 @@ def to_txt(self, dest_dir=None, file_root=None, sep="\t", decoys=False):
An optional prefix for the confidence estimate files. The
suffix will always be `mokapot.psms.txt` and
`mokapot.peptides.txt`.
sep : str
sep : str, optional
The delimiter to use.
decoys : bool
decoys : bool, optional
Save decoys confidence estimates as well?
Returns
Expand Down Expand Up @@ -212,7 +316,6 @@ class LinearConfidence(Confidence):

def __init__(self, psms, scores, desc=True, eval_fdr=0.01):
"""Initialize a a LinearPsmConfidence object"""
LOGGER.info("=== Assigning Confidence ===")
super().__init__(psms, scores, desc)
self._target_column = _new_column("target", self._data)
self._data[self._target_column] = psms.targets
Expand Down Expand Up @@ -283,7 +386,7 @@ def _assign_confidence(self, desc=True):
proteins = picked_protein(
peptides,
self._target_column,
self._peptide_column[0],
self._peptide_column,
self._score_column,
self._proteins,
)
Expand All @@ -297,6 +400,11 @@ def _assign_confidence(self, desc=True):
).reset_index(drop=True)
scores = data.loc[:, self._score_column].values
targets = data.loc[:, self._target_column].astype(bool).values
if all(targets):
LOGGER.warning(
"No decoy PSMs remain for confidence estimation. "
"Confidence estimates may be unreliable."
)

# Estimate q-values and assign to dataframe
LOGGER.info("Assiging q-values to %s...", level)
Expand All @@ -320,9 +428,16 @@ def _assign_confidence(self, desc=True):

# Calculate PEPs
LOGGER.info("Assiging PEPs to %s...", level)
_, pep = qvality.getQvaluesFromScores(
scores[targets], scores[~targets], includeDecoys=True
)
try:
_, pep = qvality.getQvaluesFromScores(
scores[targets], scores[~targets], includeDecoys=True
)
except SystemExit as msg:
print(msg)
if "no decoy hits available for PEP calculation" in str(msg):
pep = 0
else:
raise

level = level.lower()
data["mokapot PEP"] = pep
Expand Down
83 changes: 51 additions & 32 deletions mokapot/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,7 @@
The :py:class:`LinearPsmDataset` and :py:class:`CrossLinkedPsmDataset`
classes are used to define collections peptide-spectrum matches. The
:py:class:`LinearPsmDataset` class is suitable for most types of
data-dependent acquisition proteomics experiments, whereas the
:py:class:`CrossLinkedPsmDataset` is specifically designed for
collections of cross-linked PSMs (CSMs) originating from
cross-linking proteomics experiments.
data-dependent acquisition proteomics experiments.
Although either class can be constructed from a
:py:class:`pandas.DataFrame`, it is often easier to load the PSMs directly
Expand All @@ -27,8 +24,12 @@
from . import qvalues
from . import utils
from .proteins import read_fasta
from .confidence import LinearConfidence, CrossLinkedConfidence
from .proteins import FastaProteins
from .confidence import (
LinearConfidence,
CrossLinkedConfidence,
GroupedConfidence,
)

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -90,22 +91,36 @@ def _update_labels(self, scores, eval_fdr, desc):
return

def __init__(
self, psms, spectrum_columns, feature_columns, other_columns, copy_data
self,
psms,
spectrum_columns,
feature_columns,
group_column,
other_columns,
copy_data,
):
"""Initialize an object"""
self._data = psms.copy(deep=copy_data).reset_index(drop=True)
self._proteins = None

# Set columns
self._spectrum_columns = utils.tuplize(spectrum_columns)
self._group_column = group_column

if other_columns is not None:
other_columns = utils.tuplize(other_columns)
else:
other_columns = ()

if group_column is not None:
group_column = (group_column,)
else:
group_column = ()

# Check that all of the columns exist:
used_columns = sum([other_columns, self._spectrum_columns], tuple())
used_columns = sum(
[other_columns, self._spectrum_columns, group_column], tuple()
)

missing_columns = [c not in self.data.columns for c in used_columns]
if not missing_columns:
Expand Down Expand Up @@ -162,6 +177,13 @@ def spectra(self):
"""
return self.data.loc[:, self._spectrum_columns]

@property
def groups(self):
"""
A :py:class:`pandas.Series` of the groups for confidence estimation.
"""
return self.data.loc[:, self._group_column]

@property
def columns(self):
"""The columns of the dataset."""
Expand Down Expand Up @@ -336,7 +358,9 @@ class LinearPsmDataset(PsmDataset):
protein_column : str, optional
The column that specifies which protein(s) the detected peptide
might have originated from. This column is not used to compute
protein-level confidence estimates (see :py:meth:`add_fasta()`).
protein-level confidence estimates (see :py:meth:`add_proteins()`).
group_column : str, optional
A factor by which to group PSMs for grouped confidence estimation.
feature_columns : str or tuple of str, optional
The column(s) specifying the feature(s) for mokapot analysis. If
`None`, these are assumed to be all columns not specified in the
Expand All @@ -353,7 +377,8 @@ class LinearPsmDataset(PsmDataset):
metadata : pandas.DataFrame
features : pandas.DataFrame
spectra : pandas.DataFrame
peptides : pandas.DataFrame
peptides : pandas.Series
groups : pandas.Series
targets : numpy.ndarray
columns : list of str
has_proteins : bool
Expand All @@ -365,39 +390,26 @@ def __init__(
target_column,
spectrum_columns,
peptide_column,
protein_column,
protein_column=None,
group_column=None,
feature_columns=None,
copy_data=True,
):
"""Initialize a PsmDataset object."""
self._target_column = target_column
self._peptide_column = utils.tuplize(peptide_column)

if protein_column is not None:
self._protein_column = utils.tuplize(protein_column)
else:
self._protein_column = tuple()

# Some error checking:
if len(self._peptide_column) > 1:
raise ValueError(
"Only one column can be used for " "'peptide_column'"
)
self._peptide_column = peptide_column
self._protein_column = protein_column

# Finish initialization
other_columns = sum(
[
utils.tuplize(self._target_column),
self._peptide_column,
self._protein_column,
],
tuple(),
)
other_columns = [target_column, peptide_column]
if protein_column is not None:
other_columns += [protein_column]

super().__init__(
psms=psms,
spectrum_columns=spectrum_columns,
feature_columns=feature_columns,
group_column=group_column,
other_columns=other_columns,
copy_data=copy_data,
)
Expand Down Expand Up @@ -438,7 +450,7 @@ def targets(self):

@property
def peptides(self):
"""A :py:class:`pandas.DataFrame` of the peptide column."""
"""A :py:class:`pandas.Series` of the peptide column."""
return self.data.loc[:, self._peptide_column]

def _update_labels(self, scores, eval_fdr=0.01, desc=True):
Expand Down Expand Up @@ -509,7 +521,14 @@ def assign_confidence(self, scores=None, desc=True, eval_fdr=0.01):
LOGGER.info("Selected %s as the best feature.", feat)
scores = self.features[feat].values

return LinearConfidence(self, scores, eval_fdr=eval_fdr, desc=desc)
if self._group_column is None:
LOGGER.info("Assigning confidence...")
return LinearConfidence(self, scores, eval_fdr=eval_fdr, desc=desc)
else:
LOGGER.info("Assigning confidence within groups...")
return GroupedConfidence(
self, scores, eval_fdr=eval_fdr, desc=desc
)


class CrossLinkedPsmDataset(PsmDataset):
Expand Down
2 changes: 2 additions & 0 deletions mokapot/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,8 @@ def _find_hyperparameters(model, features, labels):
new_est = model.estimator.estimator
new_est.set_params(**best_params)
model._needs_cv = False
for param, value in best_params.items():
LOGGER.info("\t- %s = %s", param, value)
else:
new_est = model.estimator

Expand Down
Loading

0 comments on commit c0bc3a9

Please sign in to comment.