Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhanced Sequencing Run Logging #343

Merged
merged 36 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
da45608
implemented report_gen submodule
Lilferrit Jun 18, 2024
2d6b5c3
report_gen documentation
Lilferrit Jun 18, 2024
28fa6c8
report_gen submodule test
Lilferrit Jun 19, 2024
97e5bf1
naming conventions
Lilferrit Jun 19, 2024
4f635f9
naming conventions
Lilferrit Jun 19, 2024
aa43a8c
PredictionWriter virtual class
Lilferrit Jun 21, 2024
46bb62c
multi prediction writer
Lilferrit Jun 21, 2024
40eecb1
LogPredicitonWriter wip
Lilferrit Jun 21, 2024
2d7effa
implemented logger io
Lilferrit Jun 21, 2024
a7beddf
removed report gen submodule
Lilferrit Jun 21, 2024
65b5a83
logger io test
Lilferrit Jun 21, 2024
1f656b6
logging info
Lilferrit Jun 21, 2024
4d2fab1
implemented end of run logging
Lilferrit Jun 21, 2024
9e903e7
Merge branch 'main' into run-report-logging
Lilferrit Jun 21, 2024
22f26c7
Generate new screengrabs with rich-codex
github-actions[bot] Jun 21, 2024
2f83bb7
logger io test fix
Lilferrit Jun 21, 2024
858704e
formatting fixes
Lilferrit Jun 21, 2024
6da1219
updated screenshots
Lilferrit Jun 21, 2024
bf6c20c
test file formatting
Lilferrit Jun 21, 2024
ed1b841
Restrict NumPy to pre-2.0
bittremieux Jun 24, 2024
968f60a
Update changelog
bittremieux Jun 24, 2024
0b12fb8
PredictionMultiWriter serialization
Lilferrit Jun 24, 2024
ff37b54
log writer error handling
Lilferrit Jun 24, 2024
dee9bf0
reformatting
Lilferrit Jun 24, 2024
411f717
Merge branch 'hotfix_numpy' of github.com:Noble-Lab/casanovo into run…
Lilferrit Jun 24, 2024
19d8aa8
verified skipped spectra counter
Lilferrit Jun 24, 2024
d467e87
Generate new screengrabs with rich-codex
github-actions[bot] Jun 24, 2024
56ef340
changelog merge conflict
Lilferrit Jun 27, 2024
79c706e
migrated end of run report logging functionality to ms_io
Lilferrit Jun 28, 2024
4942a48
moved logging utility functions to util.py
Lilferrit Jul 3, 2024
66860e2
requested changes
Lilferrit Jul 8, 2024
a4d6649
more requested changes
Lilferrit Jul 9, 2024
13ce8a0
Merge branch 'dev' into run-report-logging
Lilferrit Jul 9, 2024
51df665
resolved dev merge conflicts
Lilferrit Jul 10, 2024
57b0284
Minor simplifications
bittremieux Jul 10, 2024
e0f5230
Fix tests
bittremieux Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,6 @@ def sequence(

runner.predict(peak_path, output)

logger.info("DONE!")


@main.command(cls=_SharedParams)
@click.argument(
Expand Down
178 changes: 175 additions & 3 deletions casanovo/data/ms_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,99 @@

import collections
import csv
import logging
import operator
import os
import re
from datetime import datetime
from pathlib import Path
from typing import List
from socket import gethostname
from sys import argv
from time import time
from typing import List, Optional, Dict

import natsort
import numpy as np
import torch
from pandas import DataFrame

from .. import __version__
from ..config import Config

# Default confidence score thresholds; used as the default `score_bins`
# argument of MztabWriter for the score distribution in the run report.
SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]

# Logger registered under the name "casanovo"; the run report is emitted
# through it.
logger = logging.getLogger("casanovo")


def get_score_bins(
    results_table: DataFrame, score_bins: List[float]
) -> Dict[float, int]:
    """
    Count, for each confidence threshold, the spectra scoring at or above it.

    Parameters
    ----------
    results_table : DataFrame
        Parsed spectrum match table; its "score" column is read.
    score_bins : List[float]
        Confidence thresholds to tally.

    Returns
    -------
    score_bin_dict : Dict[float, int]
        Dictionary mapping each threshold to the number of rows whose
        score is greater than or equal to that threshold.
    """
    all_scores = results_table["score"].to_numpy()
    score_bin_dict = {}
    for threshold in score_bins:
        # Boolean-mask the scores and count what survives the cut.
        at_or_above = all_scores[all_scores >= threshold]
        score_bin_dict[threshold] = len(at_or_above)
    return score_bin_dict


def get_peptide_lengths(results_table: DataFrame) -> np.ndarray:
    """
    Compute the length of every peptide sequence in results_table.

    Parameters
    ----------
    results_table : DataFrame
        Parsed spectrum match table; its "sequence" column is read.

    Returns
    -------
    sequence_lengths : np.ndarray
        Numpy array holding the length of each sequence, in the same
        order as the rows of the table.
    """
    # Only ASCII letters count towards the length: everything else
    # (e.g. the digits and signs of a mass modification) is stripped first.
    non_letters = re.compile("[^a-zA-Z]")

    def _letter_count(sequence):
        return len(non_letters.sub("", sequence))

    return results_table["sequence"].apply(_letter_count).to_numpy()


def get_peptide_length_histo(peptide_lengths: np.ndarray) -> Dict[int, int]:
    """
    Build a histogram of peptide lengths.

    Parameters
    ----------
    peptide_lengths : np.ndarray
        Numpy array containing the length of each sequence.

    Returns
    -------
    peptide_length_histogram : Dict[int, int]
        Dictionary mapping each distinct length to the number of times
        it occurs, with keys in ascending order.
    """
    distinct, frequencies = np.unique(peptide_lengths, return_counts=True)
    # tolist() turns the NumPy scalars into plain Python numbers.
    pairs = zip(distinct.tolist(), frequencies.tolist())
    return {length: count for length, count in pairs}


class MztabWriter:
"""
Expand All @@ -22,9 +104,12 @@
----------
filename : str
The name of the mzTab file.
score_bins : List[float] (optional)
Confidence score bins for generating sequence confidence score
cmf. Defaults to [0.0, 0.5, 0.9, 0.95, 0.99].
"""

def __init__(self, filename: str):
def __init__(self, filename: str, score_bins: List[float] = SCORE_BINS):
self.filename = filename
self.metadata = [
("mzTab-version", "1.0.0"),
Expand All @@ -43,6 +128,8 @@
]
self._run_map = {}
self.psms = []
self.start_time = time()
self.score_bins = score_bins

def set_metadata(self, config: Config, **kwargs) -> None:
"""
Expand Down Expand Up @@ -143,10 +230,95 @@
)
self._run_map[filename] = i

def get_report_dict(self) -> Optional[Dict]:
    """
    Generate the sequencing run report.

    The report is computed from the PSMs collected in `self.psms`
    (element 0 of each PSM is used as the sequence, element 2 as the
    score) and the confidence thresholds stored in `self.score_bins`.

    Returns
    -------
    report_gen : Optional[Dict]
        Generated report represented as a dictionary with the keys
        "num_spectra", "score_bins", "max_sequence_length",
        "min_sequence_length", "median_sequence_length" and
        "peptide_length_histogram", or None if no sequencing
        predictions were logged.
    """
    results_table = DataFrame(
        {
            "sequence": [psm[0] for psm in self.psms],
            "score": [psm[2] for psm in self.psms],
        }
    )

    # Nothing was predicted, so there are no statistics to compute.
    # NOTE(review): coverage tooling flagged this early return as not
    # exercised by the tests.
    if results_table.empty:
        return None

    peptide_lengths = get_peptide_lengths(results_table)
    # int() turns the NumPy scalars into plain Python ints; for the median
    # (a float) this truncates any fractional part.
    return {
        "num_spectra": len(results_table),
        "score_bins": get_score_bins(results_table, self.score_bins),
        "max_sequence_length": int(np.max(peptide_lengths)),
        "min_sequence_length": int(np.min(peptide_lengths)),
        "median_sequence_length": int(np.median(peptide_lengths)),
        "peptide_length_histogram": get_peptide_length_histo(
            peptide_lengths
        ),
    }

def log_run_report(self) -> None:
    """
    Log the sequencing run report.

    Writes, through the module logger: run timing (when
    `self.start_time` is set), the executed command line, the host
    name, the current date, and either a warning that no predictions
    were logged or the number of sequenced spectra followed by the
    score distribution and the maximum/minimum sequence lengths from
    `get_report_dict`. The peak GPU memory allocation is logged last
    when CUDA is available.
    """
    logger.info("======= Sequencing Run Report =======")
    if self.start_time is not None:
        end_time = time()
        elapsed_time = end_time - self.start_time
        logger.info(
            f"Sequencing Run Start Timestamp: {int(self.start_time)}s"
        )
        logger.info(f"Sequencing Run End Timestamp: {int(end_time)}s")
        logger.info(f"Time Elapsed: {int(elapsed_time)}s")

    run_report = self.get_report_dict()
    run_date_string = datetime.now().strftime("%m/%d/%y %H:%M:%S")
    logger.info(f"Executed Command: {' '.join(argv)}")
    logger.info(f"Executed on Host Machine: {gethostname()}")
    logger.info(f"Sequencing run date: {run_date_string}")

    if run_report is None:
        # NOTE(review): coverage tooling flagged this branch as not
        # exercised by the tests.
        logger.warning(
            "No predictions were logged, this may be due to an error"
        )
    else:
        num_spectra = run_report["num_spectra"]
        logger.info(f"Sequenced {num_spectra} spectra")
        logger.info("Score Distribution:")
        # Report the thresholds in ascending order.
        for score, pop in sorted(run_report["score_bins"].items()):
            pop_percentage = 100 * pop / num_spectra
            logger.info(
                f"{pop} spectra ({pop_percentage:.2f}%) scored >= {score}"
            )

        logger.info(
            f"Max Sequence Length: {run_report['max_sequence_length']}"
        )
        logger.info(
            f"Min Sequence Length: {run_report['min_sequence_length']}"
        )

    if torch.cuda.is_available():
        # max_memory_allocated() is in bytes; divide by 10**6 for megabytes.
        gpu_util = torch.cuda.max_memory_allocated() / (10**6)
        logger.info(f"Max GPU Memory Utilization: {int(gpu_util)}mb")

Check warning on line 314 in casanovo/data/ms_io.py

View check run for this annotation

Codecov / codecov/patch

casanovo/data/ms_io.py#L313-L314

Added lines #L313 - L314 were not covered by tests

def save(self) -> None:
"""
Export the spectrum identifications to the mzTab file.
Export the spectrum identifications to the mzTab file and
log end of run report
"""
self.log_run_report()
with open(self.filename, "w", newline="") as f:
writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep)
# Write metadata.
Expand Down
Loading
Loading