Skip to content

Commit

Permalink
✨ cherry picks internal fixes from !68 and !70
Browse files Browse the repository at this point in the history
  • Loading branch information
gessulat committed Sep 19, 2024
1 parent 6b8fbfd commit c82b1e0
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 221 deletions.
8 changes: 7 additions & 1 deletion .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,17 @@ publish:
- python -m build --sdist --wheel .
- TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python -m twine upload --repository-url https://gitlab.com/api/v4/projects/${CI_PROJECT_ID}/packages/pypi dist/*

check_formatting:
extends: .with_twine
stage: test
script:
- pip install .[dev]
- ruff check . --exclude docs/

unit_test:
extends: .with_twine
stage: test
script:
- pip install .[dev]
- pip install pytest
- pytest tests/

52 changes: 27 additions & 25 deletions mokapot/confidence.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,35 +19,35 @@
from __future__ import annotations

import logging
from pathlib import Path
import os
from contextlib import contextmanager
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from typeguard import typechecked
import os

from . import qvalues
from .peps import peps_from_scores
from .confidence_writer import write_confidences
from .constants import CONFIDENCE_CHUNK_SIZE
from .dataset import OnDiskPsmDataset
from .peps import peps_from_scores, PepsConvergenceError
from .picked_protein import picked_protein
from .tabular_data import (
TabularDataWriter,
TabularDataReader,
get_score_column_type,
)
from .utils import (
create_chunks,
groupby_max,
convert_targets_column,
merge_sort,
get_dataframe_from_records,
)
from .dataset import OnDiskPsmDataset
from .picked_protein import picked_protein
from .writers import to_flashlfq
from .tabular_data import (
TabularDataWriter,
TabularDataReader,
get_score_column_type,
)
from .confidence_writer import write_confidences
from .constants import CONFIDENCE_CHUNK_SIZE

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -433,11 +433,15 @@ def _assign_confidence(
self.peps = peps_from_scores(
self.scores, self.targets, peps_algorithm
)
except SystemExit as msg:
if "no decoy hits available for PEP calculation" in str(msg):
self.peps = 0
else:
raise
except PepsConvergenceError:
LOGGER.info(
"\t- Encountered convergence problems in `nnls`. "
"Falling back to qvality ...",
)
self.peps = peps_from_scores(
self.scores, self.targets, "qvality"
)

if peps_error and all(self.peps == 1):
raise ValueError("PEP values are all equal to 1.")

Expand Down Expand Up @@ -488,7 +492,7 @@ def assign_confidence(
psms: list[OnDiskPsmDataset],
max_workers,
scores=None,
descs: list[bool] | None=None,
descs: list[bool] | None = None,
eval_fdr=0.01,
dest_dir: Path | None = None,
file_root: str = "",
Expand Down Expand Up @@ -721,12 +725,10 @@ def assign_confidence(
psm_count += 1
for level in levels:
if level != "psms" or deduplication:
psm_hash = str(
[
data_row.get(col)
for col in level_hash_columns[level]
]
)
psm_hash = str([
data_row.get(col)
for col in level_hash_columns[level]
])
if psm_hash in seen_level_entities[level]:
if level == "psms":
break
Expand Down
103 changes: 73 additions & 30 deletions mokapot/peps.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from triqler import qvality

from scipy.optimize import nnls
from triqler import qvality

PEP_ALGORITHM = {
"qvality": lambda scores, targets: peps_from_scores_qvality(
Expand All @@ -21,6 +20,12 @@
}


class PepsConvergenceError(Exception):
"""Raised when nnls iterations do not converge."""

pass


def peps_from_scores(scores, targets, pep_algorithm="qvality"):
"""
Compute PEPs from scores.
Expand Down Expand Up @@ -67,14 +72,23 @@ def peps_from_scores_qvality(scores, targets, use_binary=False):
else qvality.getQvaluesFromScores
)
old_verbosity, qvality.VERB = qvality.VERB, 0
_, peps = qvalues_from_scores(
scores[targets],
scores[~targets],
includeDecoys=True,
includePEPs=True,
tdcInput=False,
)
qvality.VERB = old_verbosity

try:
_, peps = qvalues_from_scores(
scores[targets],
scores[~targets],
includeDecoys=True,
includePEPs=True,
tdcInput=False,
)
except SystemExit as msg:
if "no decoy hits available for PEP calculation" in str(msg):
peps = 0
else:
raise
finally:
qvality.VERB = old_verbosity

return peps


Expand Down Expand Up @@ -248,7 +262,7 @@ def peps_from_scores_kde_nnls(
return peps


def fit_nnls(n, k, ascending=True, *, weight_exponent=1, erase_zeros=False):
def fit_nnls(n, k, ascending=True, *, weight_exponent=-1, erase_zeros=False):
"""
This method performs a non-negative least squares (NNLS) fit on given input parameters 'n' and 'k', such that
`k[i]` is close to `p[i] * n[i]` in a weighted least squared sense (weight is determined by `n[i] ** weightExponent`)
Expand All @@ -260,7 +274,7 @@ def fit_nnls(n, k, ascending=True, *, weight_exponent=1, erase_zeros=False):
- n: The input array of length N.
- k: The input array of length N.
- ascending: (optional) A boolean value indicating whether the output array should be in ascending order. Default value is True.
- weight_exponent: (optional) The exponent to be used for the weight array. Default value is 1.
- weight_exponent: (optional) The exponent to be used for the weight array. Default value is -1.
- erase_zeros: (optional) If True, rows corresponding to n==0 will be erased from the system of equations,
whereas if False (default), there will be equations inserted that try to minimize the jump in probabilities,
i.e. distribute the jumps evenly
Expand All @@ -273,23 +287,45 @@ def fit_nnls(n, k, ascending=True, *, weight_exponent=1, erase_zeros=False):
# This is more or less the same, just with
# JSPP Q: With what??
if not ascending:
return fit_nnls(n[::-1], k[::-1])[::-1]
n = n[::-1]
k = k[::-1]

N = len(n)
D = np.diag(n)
A = D @ np.tril(np.ones((N, N)))
w = n ** (0.5 * weight_exponent)
w = np.zeros_like(n, dtype=float)
w[n != 0] = n[n != 0] ** (0.5 * weight_exponent)
W = np.diag(w)

nz = (n == 0).nonzero()[0]
if len(nz) > 0:
if not erase_zeros:
A[nz, nz] = 1
A[nz, np.minimum(nz + 1, N - 1)] = -1
w[nz] = 1
k[nz] = 0
W = np.diag(w)
else:
W = np.delete(W, nz, axis=0)
R = np.eye(N)

zeros = (n == 0).nonzero()[0]
if len(zeros) > 0:
A[zeros, zeros] = 1
A[zeros, np.minimum(zeros + 1, N - 1)] = -1
w[zeros] = 1
k[zeros] = 0
W = np.diag(w)

if erase_zeros:
# The following lines remove variables that will end up being the
# same (matrix R) as well as equations that are essentially zero on
# both sides U). However, since this is a bit tricky, and difficult
# to maintain and does not seem to lower the condition of the
# matrix substantially, it is only activated on demand and left
# here more for further reference, in case it will be needed in the
# future.
nnz = n != 0
nnz[-1] = True
nnz2 = np.insert(nnz, 0, True)[:-1]

def make_perm_mat(rows, cols):
M = np.zeros((np.max(rows) + 1, np.max(cols) + 1))
M[rows, cols] = 1
return M

R = make_perm_mat(np.arange(N), nnz2.cumsum() - 1)
U = make_perm_mat(np.arange(sum(nnz)), nnz.nonzero()[0])
W = U @ W

# The default tolerance of nnls is too low, leading sometimes to
# non-convergent iterations and subsequent failures. A good tolerance
Expand All @@ -301,9 +337,13 @@ def fit_nnls(n, k, ascending=True, *, weight_exponent=1, erase_zeros=False):
# non-convergence and b) is fitting for the typical condition numbers and
# values of k seen in experiments.
tol = 1e-7
d, _ = nnls(W @ A, W @ k, atol=tol)
p = np.cumsum(d)
return p
d, _ = nnls(W @ A @ R, W @ k, atol=tol)
p = np.cumsum(R @ d)

if not ascending:
return p[::-1]
else:
return p


def hist_data_from_scores(scores, targets, bins=None, density=False):
Expand Down Expand Up @@ -386,7 +426,10 @@ def peps_from_scores_hist_nnls(scores, targets, scale_to_one=True):

# Do monotone fit, minimizing || n - diag(p) * k || with weights n over
# monotone descending p
pep_est = fit_nnls(n, k, ascending=False)
try:
pep_est = fit_nnls(n, k, ascending=False, weight_exponent=-1)
except RuntimeError as e:
raise PepsConvergenceError from e

if scale_to_one and pep_est[0] < 1:
pep_est = pep_est / pep_est[0]
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ find = {namespaces = false}
[tool.setuptools_scm]

[tool.ruff]
exclude = ["*.ipynb"]
extend-exclude = ["docs/source/conf.py"]
line-length = 79
target-version = "py39"
Expand All @@ -80,3 +81,4 @@ select = ["E", "F", "T20"] # T20 is for print() statements.

[tool.ruff.format]
docstring-code-format = true
preview = true
3 changes: 1 addition & 2 deletions tests/system_tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
@pytest.fixture
def scope_files():
"""Get the scope-ms files"""
return sorted(list(Path("data").glob("scope*")))
return sorted(list(Path("data").glob("scope*.pin")))


@pytest.fixture
Expand Down Expand Up @@ -69,7 +69,6 @@ def test_basic_cli(tmp_path, scope_files):
assert targets_psms_df.iloc[0, 5] == "sp|P10809|CH60_HUMAN"



def test_cli_options(tmp_path, scope_files):
"""Test non-defaults"""
params = [
Expand Down
Loading

0 comments on commit c82b1e0

Please sign in to comment.