From dc757b90e2d26797293c75bd64f50277eee20901 Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 25 Dec 2023 12:48:11 +0100 Subject: [PATCH 1/8] Update `output` command-line description Fixes #241. --- CHANGELOG.md | 4 ++++ casanovo/casanovo.py | 15 ++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbc9284e..4d2e54ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +### Changed + +- Update description of the `output` command-line argument to reflect that this is the root of the output (mzTab, log) files. + ## [4.0.0] - 2023-12-22 ### Added diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 0a1c3618..1fa53f6b 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -58,14 +58,17 @@ def __init__(self, *args, **kwargs) -> None: click.Option( ("-m", "--model"), help=""" - The model weights (.ckpt file). If not provided, Casanovo - will try to download the latest release. + The model weights (.ckpt file). If not provided, Casanovo will + try to download the latest release (during sequencing). """, type=click.Path(exists=True, dir_okay=False), ), click.Option( ("-o", "--output"), - help="The mzTab file to which results will be written.", + help="The root file name to which results (i.e. the mzTab file " + "during sequencing, as well as the log file during all " + "modes) will be written. If not specified, a default " + "timestamped file name will be used.", type=click.Path(dir_okay=False), ), click.Option( @@ -139,7 +142,7 @@ def sequence( for peak_file in peak_path: logger.info(" %s", peak_file) - runner.predict(peak_path, output) + runner.predict(peak_path, str(output)) logger.info("DONE!") @@ -328,7 +331,7 @@ def setup_model( config: Optional[str], output: Optional[Path], is_train: bool, -) -> Config: +) -> Tuple[Config, str]: """Setup Casanovo for most commands. Parameters @@ -347,6 +350,8 @@ def setup_model( ------ config : Config The parsed configuration + model : str + The name of the model weights. """ # Read parameters from the config file. config = Config(config) From ff896eac9b443d6305c85ef881baba06f88862d4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 25 Dec 2023 11:51:31 +0000 Subject: [PATCH 2/8] Generate new screengrabs with rich-codex --- docs/images/evaluate-help.svg | 162 ++++++++++++++----------- docs/images/sequence-help.svg | 162 ++++++++++++++----------- docs/images/train-help.svg | 222 +++++++++++++++++++--------------- 3 files changed, 313 insertions(+), 233 deletions(-) diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg index e220664b..90998a59 100644 --- a/docs/images/evaluate-help.svg +++ b/docs/images/evaluate-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo evaluate --help - -Usage:casanovo evaluate [OPTIONSANNOTATED_PEAK_PATH...                       - - Evaluate de novo peptide sequencing performance.                                - ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       - provided by MassIVE-KB.                                                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  ANNOTATED_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---model-mFILE                        The model weights (.ckpt file).  -                                              If not provided, Casanovo will   -                                              try to download the latest       -                                              release.                         ---output-oFILE                        The mzTab file to which results  -                                              will be written.                 ---config-cFILE                        The YAML configuration file      -                                              overriding the default options.  ---verbosity-v[debug|info|warning|error]  Set the verbosity of console     -                                              logging messages. Log files are  -                                              always set to 'debug'.           ---help-h  Show this message and exit.      -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo evaluate --help + +Usage:casanovo evaluate [OPTIONSANNOTATED_PEAK_PATH...                       + + Evaluate de novo peptide sequencing performance.                                + ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       + provided by MassIVE-KB.                                                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  ANNOTATED_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mFILE                        The model weights (.ckpt file).  +                                              If not provided, Casanovo will   +                                              try to download the latest       +                                              release (during sequencing).     +--output-oFILE                        The root file name to which      +                                              results (i.e. the mzTab file     +                                              during sequencing, as well as    +                                              the log file during all modes)   +                                              will be written. If not          +                                              specified, a default             +                                              timestamped file name will be    +                                              used.                            +--config-cFILE                        The YAML configuration file      +                                              overriding the default options.  +--verbosity-v[debug|info|warning|error]  Set the verbosity of console     +                                              logging messages. Log files are  +                                              always set to 'debug'.           +--help-h  Show this message and exit.      +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index d493e2b2..1429c1cd 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo sequence --help - -Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  - peptides.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ ---model-mFILE                        The model weights (.ckpt file).  -                                              If not provided, Casanovo will   -                                              try to download the latest       -                                              release.                         ---output-oFILE                        The mzTab file to which results  -                                              will be written.                 ---config-cFILE                        The YAML configuration file      -                                              overriding the default options.  ---verbosity-v[debug|info|warning|error]  Set the verbosity of console     -                                              logging messages. Log files are  -                                              always set to 'debug'.           ---help-h  Show this message and exit.      -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  + peptides.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mFILE                        The model weights (.ckpt file).  +                                              If not provided, Casanovo will   +                                              try to download the latest       +                                              release (during sequencing).     +--output-oFILE                        The root file name to which      +                                              results (i.e. the mzTab file     +                                              during sequencing, as well as    +                                              the log file during all modes)   +                                              will be written. If not          +                                              specified, a default             +                                              timestamped file name will be    +                                              used.                            +--config-cFILE                        The YAML configuration file      +                                              overriding the default options.  +--verbosity-v[debug|info|warning|error]  Set the verbosity of console     +                                              logging messages. Log files are  +                                              always set to 'debug'.           +--help-h  Show this message and exit.      +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index 82c30122..11e07aef 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo train --help - -Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -*  TRAIN_PEAK_PATH    FILE[required] -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -*--validation_peak_pa…-pFILE                    An annotated MGF file   -                                                       for validation, like    -                                                       from MassIVE-KB. Use    -                                                       this option multiple    -                                                       times to specify        -                                                       multiple files.         -[required]             ---model-mFILE                    The model weights       -                                                       (.ckpt file). If not    -                                                       provided, Casanovo      -                                                       will try to download    -                                                       the latest release.     ---output-oFILE                    The mzTab file to       -                                                       which results will be   -                                                       written.                ---config-cFILE                    The YAML configuration  -                                                       file overriding the     -                                                       default options.        ---verbosity-v[debug|info|warning|er  Set the verbosity of    -ror]  console logging         -                                                       messages. Log files     -                                                       are always set to       -                                                       'debug'.                ---help-h  Show this message and   -                                                       exit.                   -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +*--validation_peak_pa…-pFILE                    An annotated MGF file   +                                                       for validation, like    +                                                       from MassIVE-KB. Use    +                                                       this option multiple    +                                                       times to specify        +                                                       multiple files.         +[required]             +--model-mFILE                    The model weights       +                                                       (.ckpt file). If not    +                                                       provided, Casanovo      +                                                       will try to download    +                                                       the latest release      +                                                       (during sequencing).    +--output-oFILE                    The root file name to   +                                                       which results (i.e.     +                                                       the mzTab file during   +                                                       sequencing, as well as  +                                                       the log file during     +                                                       all modes) will be      +                                                       written. If not         +                                                       specified, a default    +                                                       timestamped file name   +                                                       will be used.           +--config-cFILE                    The YAML configuration  +                                                       file overriding the     +                                                       default options.        +--verbosity-v[debug|info|warning|er  Set the verbosity of    +ror]  console logging         +                                                       messages. Log files     +                                                       are always set to       +                                                       'debug'.                +--help-h  Show this message and   +                                                       exit.                   +╰──────────────────────────────────────────────────────────────────────────────╯ + From ac60313b1620d532f3f63caae4879eddbf2aff9a Mon Sep 17 00:00:00 2001 From: Wout Bittremieux Date: Mon, 25 Dec 2023 13:02:15 +0100 Subject: [PATCH 3/8] Fix formatting --- casanovo/casanovo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 1fa53f6b..b9915e2a 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -66,9 +66,9 @@ def __init__(self, *args, **kwargs) -> None: click.Option( ("-o", "--output"), help="The root file name to which results (i.e. the mzTab file " - "during sequencing, as well as the log file during all " - "modes) will be written. If not specified, a default " - "timestamped file name will be used.", + "during sequencing, as well as the log file during all modes) " + "will be written. If not specified, a default timestamped file " + "name will be used.", type=click.Path(dir_okay=False), ), click.Option( From a3fe4090f1a982f185d08b23855f854245e7baf9 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 26 Jun 2024 14:49:45 -0700 Subject: [PATCH 4/8] added train root option --- casanovo/casanovo.py | 16 +++++++++++++++- casanovo/denovo/model_runner.py | 6 ++++++ tests/test_integration.py | 7 +++++++ 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 8227e755..7c338953 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -197,9 +197,23 @@ def evaluate( multiple=True, type=click.Path(exists=True, dir_okay=False), ) +@click.option( + "-r", + "--root", + help=""" + Root name for all model checkpoints saved during training, + i.e. if root is specified as `--root foo` than all saved + checkpoints filename will have the format `foo.epoch=2-step=150000.ckpt`. + If root is not specified the checkpoint filenames will instead be of + the format `epoch=2-step=150000.ckpt`. + """, + required=False, + type=str, +) def train( train_peak_path: Tuple[str], validation_peak_path: Tuple[str], + root: Optional[str], model: Optional[str], config: Optional[str], output: Optional[str], @@ -212,7 +226,7 @@ def train( """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, True) - with ModelRunner(config, model) as runner: + with ModelRunner(config, model, root_checkpoint_name=root) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: logger.info(" %s", peak_file) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 4bd2165e..94ae06c3 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -42,6 +42,7 @@ def __init__( self, config: Config, model_filename: Optional[str] = None, + root_checkpoint_name: Optional[str] = None, ) -> None: """Initialize a ModelRunner""" self.config = config @@ -54,6 +55,10 @@ def __init__( self.loaders = None self.writer = None + checkpoint_filename = None + if root_checkpoint_name is not None: + checkpoint_filename = root_checkpoint_name + ".{epoch}-{step}" + # Configure checkpoints. if config.save_top_k is not None: self.callbacks = [ @@ -62,6 +67,7 @@ def __init__( monitor="valid_CELoss", mode="min", save_top_k=config.save_top_k, + filename=checkpoint_filename, ) ] else: diff --git a/tests/test_integration.py b/tests/test_integration.py index e5d4b285..8c2534a5 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -35,6 +35,13 @@ def test_train_and_run( assert result.exit_code == 0 assert model_file.exists() + # Train with root option specified + train_args.extend(["--root", "foobar"]) + named_model_file = tmp_path / "foobar.epoch=19-step=20.ckpt" + result = run(train_args) + assert result.exit_code == 0 + assert named_model_file.exists() + # Try evaluating: eval_args = [ "evaluate", From e484e8ec49bfb1f08573a1ad1f1a8e6eef202adb Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 26 Jun 2024 14:49:45 -0700 Subject: [PATCH 5/8] added train root option --- casanovo/casanovo.py | 18 +++++++++++++++++- casanovo/denovo/model_runner.py | 6 ++++++ tests/test_integration.py | 7 +++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 8227e755..3d81dc46 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -197,9 +197,23 @@ def evaluate( multiple=True, type=click.Path(exists=True, dir_okay=False), ) +@click.option( + "-r", + "--root_ckpt_name", + help=""" + Root name for all model checkpoints saved during training, + i.e. if root is specified as `--root_ckpt_name foo` than all saved + checkpoint filenames will be formatted as `foo.epoch=2-step=150000.ckpt`. + If root is not specified the checkpoint filenames will instead be formatted + as `epoch=2-step=150000.ckpt`. + """, + required=False, + type=str, +) def train( train_peak_path: Tuple[str], validation_peak_path: Tuple[str], + root_ckpt_name: Optional[str], model: Optional[str], config: Optional[str], output: Optional[str], @@ -212,7 +226,9 @@ def train( """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, True) - with ModelRunner(config, model) as runner: + with ModelRunner( + config, model, root_checkpoint_name=root_ckpt_name + ) as runner: logger.info("Training a model from:") for peak_file in train_peak_path: logger.info(" %s", peak_file) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 4bd2165e..94ae06c3 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -42,6 +42,7 @@ def __init__( self, config: Config, model_filename: Optional[str] = None, + root_checkpoint_name: Optional[str] = None, ) -> None: """Initialize a ModelRunner""" self.config = config @@ -54,6 +55,10 @@ def __init__( self.loaders = None self.writer = None + checkpoint_filename = None + if root_checkpoint_name is not None: + checkpoint_filename = root_checkpoint_name + ".{epoch}-{step}" + # Configure checkpoints. if config.save_top_k is not None: self.callbacks = [ @@ -62,6 +67,7 @@ def __init__( monitor="valid_CELoss", mode="min", save_top_k=config.save_top_k, + filename=checkpoint_filename, ) ] else: diff --git a/tests/test_integration.py b/tests/test_integration.py index e5d4b285..29c3e38b 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -35,6 +35,13 @@ def test_train_and_run( assert result.exit_code == 0 assert model_file.exists() + # Train with root option specified + train_args.extend(["--root_ckpt_name", "foobar"]) + named_model_file = tmp_path / "foobar.epoch=19-step=20.ckpt" + result = run(train_args) + assert result.exit_code == 0 + assert named_model_file.exists() + # Try evaluating: eval_args = [ "evaluate", From 758e57fa740623bcde77eebdd0b5b1384e631054 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 26 Jun 2024 16:06:44 -0700 Subject: [PATCH 6/8] implemented overwrite file option --- casanovo/casanovo.py | 37 ++++++++++++++++++++++++++++----- casanovo/denovo/model_runner.py | 2 +- tests/test_integration.py | 31 +++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 3d81dc46..46fdd105 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -91,6 +91,15 @@ def __init__(self, *args, **kwargs) -> None: ), default="info", ), + click.Option( + ("-d", "--overwrite_output"), + help=""" + Whether to overwrite sequencing output files (i.e. output .log and .mzTab files) + """, + is_flag=True, + show_default=True, + default=False + ) ] @@ -130,20 +139,21 @@ def sequence( config: Optional[str], output: Optional[str], verbosity: str, + overwrite_output: bool ) -> None: """De novo sequence peptides from tandem mass spectra. PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence peptides. """ - output = setup_logging(output, verbosity) + output = setup_logging(output, verbosity, not overwrite_output) config, model = setup_model(model, config, output, False) with ModelRunner(config, model) as runner: logger.info("Sequencing peptides from:") for peak_file in peak_path: logger.info(" %s", peak_file) - runner.predict(peak_path, str(output)) + runner.predict(peak_path, output) logger.info("DONE!") @@ -161,13 +171,14 @@ def evaluate( config: Optional[str], output: Optional[str], verbosity: str, + overwrite_output: bool ) -> None: """Evaluate de novo peptide sequencing performance. ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB. """ - output = setup_logging(output, verbosity) + output = setup_logging(output, verbosity, not overwrite_output) config, model = setup_model(model, config, output, False) with ModelRunner(config, model) as runner: logger.info("Sequencing and evaluating peptides from:") @@ -218,13 +229,14 @@ def train( config: Optional[str], output: Optional[str], verbosity: str, + overwrite_output: bool, ) -> None: """Train a Casanovo model on your own data. TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided by MassIVE-KB, from which to train a new Casnovo model. """ - output = setup_logging(output, verbosity) + output = setup_logging(output, verbosity, not overwrite_output) config, model = setup_model(model, config, output, True) with ModelRunner( config, model, root_checkpoint_name=root_ckpt_name @@ -275,6 +287,7 @@ def configure(output: str) -> None: def setup_logging( output: Optional[str], verbosity: str, + check_overwrite: bool = False ) -> Path: """Set up the logger. @@ -292,11 +305,25 @@ def setup_logging( output : Path The output file path. """ + OUTPUT_SUFFIXES = [".log", ".mztab"] + if output is None: output = f"casanovo_{datetime.datetime.now().strftime('%Y%m%d%H%M%S')}" output = Path(output).expanduser().resolve() + if check_overwrite: + for output_suffix in OUTPUT_SUFFIXES: + next_path = output.with_suffix(output.suffix + output_suffix) + if not next_path.is_file(): + continue + + raise FileExistsError( + f"Output file {next_path} already exists, existing output files " + f"can't be overwritten without setting the --overwrite_output " + f"flag" + ) + logging_levels = { "debug": logging.DEBUG, "info": logging.INFO, @@ -323,7 +350,7 @@ def setup_logging( console_handler.setFormatter(console_formatter) root_logger.addHandler(console_handler) warnings_logger.addHandler(console_handler) - file_handler = logging.FileHandler(output.with_suffix(".log")) + file_handler = logging.FileHandler(output.with_suffix(output.suffix + ".log")) file_handler.setFormatter(log_formatter) root_logger.addHandler(file_handler) warnings_logger.addHandler(file_handler) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 94ae06c3..ff19fdec 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -152,7 +152,7 @@ def predict(self, peak_path: Iterable[str], output: str) -> None: ------- self """ - self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) + self.writer = ms_io.MztabWriter(output.with_suffix(output.suffix + ".mztab")) self.writer.set_metadata( self.config, model=str(self.model_filename), diff --git a/tests/test_integration.py b/tests/test_integration.py index 29c3e38b..420ad9b8 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -3,6 +3,7 @@ import pyteomics.mztab from click.testing import CliRunner +from pytest import raises from casanovo import casanovo @@ -35,8 +36,12 @@ def test_train_and_run( assert result.exit_code == 0 assert model_file.exists() + # Test whether running training again raises exception + with raises(FileExistsError): + run(train_args) + # Train with root option specified - train_args.extend(["--root_ckpt_name", "foobar"]) + train_args.extend(["--root_ckpt_name", "foobar", "-d"]) named_model_file = tmp_path / "foobar.epoch=19-step=20.ckpt" result = run(train_args) assert result.exit_code == 0 @@ -57,8 +62,18 @@ def test_train_and_run( result = run(eval_args) assert result.exit_code == 0 + # Check whether running eval again raises FileExistsError + with raises(FileExistsError): + run(eval_args) + + # Check overwrite option in eval + eval_args.append("-d") + result = run(eval_args) + assert result.exit_code == 0 + # Finally try predicting: output_filename = tmp_path / "test.mztab" + output_arg = output_filename.with_suffix("") predict_args = [ "sequence", "--model", @@ -66,7 +81,7 @@ def test_train_and_run( "--config", tiny_config, "--output", - str(output_filename), + str(output_arg), str(mgf_small), str(mzml_small), ] @@ -75,6 +90,18 @@ def test_train_and_run( assert result.exit_code == 0 assert output_filename.is_file() + # Test whether trying to overwrite existing output will raise exception + with raises(FileExistsError): + run(predict_args) + + assert output_filename.is_file() + + # Set overwrite flag and test whether sequencing run succeeds + predict_args.append("--overwrite_output") + result = run(predict_args) + assert result.exit_code == 0 + assert output_filename.is_file() + mztab = pyteomics.mztab.MzTab(str(output_filename)) # Verify that both input peak files are listed in the metadata. for i, filename in enumerate(["small.mgf", "small.mzml"], 1): From 84b85c3afd2dea08eadfd8dcde02a993938fb132 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Wed, 26 Jun 2024 16:09:54 -0700 Subject: [PATCH 7/8] reformatting --- casanovo/casanovo.py | 16 ++++++++-------- casanovo/denovo/model_runner.py | 4 +++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 46fdd105..6a349f9c 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -98,8 +98,8 @@ def __init__(self, *args, **kwargs) -> None: """, is_flag=True, show_default=True, - default=False - ) + default=False, + ), ] @@ -139,7 +139,7 @@ def sequence( config: Optional[str], output: Optional[str], verbosity: str, - overwrite_output: bool + overwrite_output: bool, ) -> None: """De novo sequence peptides from tandem mass spectra. @@ -171,7 +171,7 @@ def evaluate( config: Optional[str], output: Optional[str], verbosity: str, - overwrite_output: bool + overwrite_output: bool, ) -> None: """Evaluate de novo peptide sequencing performance. @@ -285,9 +285,7 @@ def configure(output: str) -> None: def setup_logging( - output: Optional[str], - verbosity: str, - check_overwrite: bool = False + output: Optional[str], verbosity: str, check_overwrite: bool = False ) -> Path: """Set up the logger. @@ -350,7 +348,9 @@ def setup_logging( console_handler.setFormatter(console_formatter) root_logger.addHandler(console_handler) warnings_logger.addHandler(console_handler) - file_handler = logging.FileHandler(output.with_suffix(output.suffix + ".log")) + file_handler = logging.FileHandler( + output.with_suffix(output.suffix + ".log") + ) file_handler.setFormatter(log_formatter) root_logger.addHandler(file_handler) warnings_logger.addHandler(file_handler) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index ff19fdec..3641fb28 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -152,7 +152,9 @@ def predict(self, peak_path: Iterable[str], output: str) -> None: ------- self """ - self.writer = ms_io.MztabWriter(output.with_suffix(output.suffix + ".mztab")) + self.writer = ms_io.MztabWriter( + output.with_suffix(output.suffix + ".mztab") + ) self.writer.set_metadata( self.config, model=str(self.model_filename), From 18c95fb1cb9ef1220878dc103fb1aeb58b8d45e5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 27 Jun 2024 00:54:14 +0000 Subject: [PATCH 8/8] Generate new screengrabs with rich-codex --- docs/images/configure-help.svg | 64 +++---- docs/images/evaluate-help.svg | 193 ++++++++++++++------- docs/images/help.svg | 157 +++++++++--------- docs/images/sequence-help.svg | 193 ++++++++++++++------- docs/images/train-help.svg | 295 +++++++++++++++++++++++---------- 5 files changed, 591 insertions(+), 311 deletions(-) diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg index 0822927a..b1fcce10 100644 --- a/docs/images/configure-help.svg +++ b/docs/images/configure-help.svg @@ -19,57 +19,63 @@ font-weight: 700; } - .terminal-3936755216-matrix { + .terminal-2766440694-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-3936755216-title { + .terminal-2766440694-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-3936755216-r1 { fill: #c5c8c6 } + .terminal-2766440694-r1 { fill: #c5c8c6 } +.terminal-2766440694-r2 { fill: #d0b344 } +.terminal-2766440694-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-2766440694-r4 { fill: #68a0b3;font-weight: bold } +.terminal-2766440694-r5 { fill: #868887 } +.terminal-2766440694-r6 { fill: #98a84b;font-weight: bold } +.terminal-2766440694-r7 { fill: #d0b344;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + @@ -81,21 +87,21 @@ - + - - $ casanovo configure --help - - Usage: casanovo configure [OPTIONS]                                             - - Generate a Casanovo configuration file to customize.                            - The casanovo configuration file is in the YAML format.                          - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --output  -o  FILE  The output configuration file.                           │ -│ --help    -h        Show this message and exit.                              │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo configure --help + +Usage:casanovo configure [OPTIONS]                                             + + Generate a Casanovo configuration file to customize.                            + The casanovo configuration file is in the YAML format.                          + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--output-oFILE  The output configuration file.                            +--help-h  Show this message and exit.                               +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/evaluate-help.svg b/docs/images/evaluate-help.svg index b16c4ffd..466fcd17 100644 --- a/docs/images/evaluate-help.svg +++ b/docs/images/evaluate-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo evaluate --help - - Usage: casanovo evaluate [OPTIONS] ANNOTATED_PEAK_PATH...                       - - Evaluate de novo peptide sequencing performance.                                - ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       - provided by MassIVE-KB.                                                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ *  ANNOTATED_PEAK_PATH    FILE  [required]                                   │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --model      -m  FILE                        The model weights (.ckpt file). │ -│                                              If not provided, Casanovo will  │ -│                                              try to download the latest      │ -│                                              release.                        │ -│ --output     -o  FILE                        The mzTab file to which results │ -│                                              will be written.                │ -│ --config     -c  FILE                        The YAML configuration file     │ -│                                              overriding the default options. │ -│ --verbosity  -v  [debug|info|warning|error]  Set the verbosity of console    │ -│                                              logging messages. Log files are │ -│                                              always set to 'debug'.          │ -│ --help       -h                              Show this message and exit.     │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo evaluate --help + +Usage:casanovo evaluate [OPTIONSANNOTATED_PEAK_PATH...                       + + Evaluate de novo peptide sequencing performance.                                + ANNOTATED_PEAK_PATH must be one or more annoated MGF files, such as those       + provided by MassIVE-KB.                                                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  ANNOTATED_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mFILE                      The model weights (.ckpt    +                                                   file). If not provided,     +                                                   Casanovo will try to        +                                                   download the latest         +                                                   release (during             +                                                   sequencing).                +--output-oFILE                      The root file name to       +                                                   which results (i.e. the     +                                                   mzTab file during           +                                                   sequencing, as well as the  +                                                   log file during all modes)  +                                                   will be written. If not     +                                                   specified, a default        +                                                   timestamped file name will  +                                                   be used.                    +--config-cFILE                      The YAML configuration      +                                                   file overriding the         +                                                   default options.            +--verbosity-v[debug|info|warning|erro  Set the verbosity of        +r]  console logging messages.   +                                                   Log files are always set    +                                                   to 'debug'.                 +--overwrite_output-d  Whether to overwrite        +                                                   sequencing output files     +                                                   (i.e. output .log and       +                                                   .mzTab files)               +--help-h  Show this message and       +                                                   exit.                       +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/help.svg b/docs/images/help.svg index 2e22e2d3..eae8e93d 100644 --- a/docs/images/help.svg +++ b/docs/images/help.svg @@ -19,126 +19,133 @@ font-weight: 700; } - .terminal-100512290-matrix { + .terminal-771077830-matrix { font-family: Fira Code, monospace; font-size: 20px; line-height: 24.4px; font-variant-east-asian: full-width; } - .terminal-100512290-title { + .terminal-771077830-title { font-size: 18px; font-weight: bold; font-family: arial; } - .terminal-100512290-r1 { fill: #c5c8c6 } + .terminal-771077830-r1 { fill: #c5c8c6 } +.terminal-771077830-r2 { fill: #d0b344 } +.terminal-771077830-r3 { fill: #c5c8c6;font-weight: bold } +.terminal-771077830-r4 { fill: #68a0b3;font-weight: bold } +.terminal-771077830-r5 { fill: #d0b344;font-weight: bold } +.terminal-771077830-r6 { fill: #608ab1;text-decoration: underline; } +.terminal-771077830-r7 { fill: #868887 } +.terminal-771077830-r8 { fill: #98a84b;font-weight: bold } - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + @@ -150,44 +157,44 @@ - + - - $ casanovo --help - - Usage: casanovo [OPTIONS] COMMAND [ARGS]...                                     - - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  - ┃                                  Casanovo                                  ┃  - ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  - Casanovo de novo sequences peptides from tandem mass spectra using a            - Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   - de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  - training new models.                                                            - - Links:                                                                          - -  • Documentation: https://casanovo.readthedocs.io                               -  • Official code repository: https://github.com/Noble-Lab/casanovo              - - If you use Casanovo in your work, please cite:                                  - -  • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   -    mass spectrometry peptide sequencing with a transformer model. Proceedings   -    of the 39th International Conference on Machine Learning - ICML '22 (2022)   -    doi:10.1101/2022.02.07.479481.                                               - -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --help  -h    Show this message and exit.                                    │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Commands ───────────────────────────────────────────────────────────────────╮ -│ configure  Generate a Casanovo configuration file to customize.              │ -│ evaluate   Evaluate de novo peptide sequencing performance.                  │ -│ sequence   De novo sequence peptides from tandem mass spectra.               │ -│ train      Train a Casanovo model on your own data.                          │ -│ version    Get the Casanovo version information                              │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo --help + +Usage:casanovo [OPTIONSCOMMAND [ARGS]...                                     + + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓  + ┃                                  Casanovo                                  ┃  + ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛  + Casanovo de novo sequences peptides from tandem mass spectra using a            + Transformer model. Casanovo currently supports mzML, mzXML, and MGF files for   + de novo sequencing and annotated MGF files, such as those from MassIVE-KB, for  + training new models.                                                            + + Links:                                                                          + + • Documentation: https://casanovo.readthedocs.io + • Official code repository: https://github.com/Noble-Lab/casanovo + + If you use Casanovo in your work, please cite:                                  + + • Yilmaz, M., Fondrie, W. E., Bittremieux, W., Oh, S. & Noble, W. S. De novo   +mass spectrometry peptide sequencing with a transformer model. Proceedings   +of the 39th International Conference on Machine Learning - ICML '22 (2022)   +doi:10.1101/2022.02.07.479481.                                               + +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--help-h    Show this message and exit.                                     +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Commands ───────────────────────────────────────────────────────────────────╮ +configure Generate a Casanovo configuration file to customize.               +evaluate  Evaluate de novo peptide sequencing performance.                   +sequence  De novo sequence peptides from tandem mass spectra.                +train     Train a Casanovo model on your own data.                           +version   Get the Casanovo version information                               +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/sequence-help.svg b/docs/images/sequence-help.svg index f5799766..07b1c48b 100644 --- a/docs/images/sequence-help.svg +++ b/docs/images/sequence-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo sequence --help - - Usage: casanovo sequence [OPTIONS] PEAK_PATH...                                 - - De novo sequence peptides from tandem mass spectra.                             - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  - peptides.                                                                       - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ *  PEAK_PATH    FILE  [required]                                             │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ --model      -m  FILE                        The model weights (.ckpt file). │ -│                                              If not provided, Casanovo will  │ -│                                              try to download the latest      │ -│                                              release.                        │ -│ --output     -o  FILE                        The mzTab file to which results │ -│                                              will be written.                │ -│ --config     -c  FILE                        The YAML configuration file     │ -│                                              overriding the default options. │ -│ --verbosity  -v  [debug|info|warning|error]  Set the verbosity of console    │ -│                                              logging messages. Log files are │ -│                                              always set to 'debug'.          │ -│ --help       -h                              Show this message and exit.     │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + $ casanovo sequence --help + +Usage:casanovo sequence [OPTIONSPEAK_PATH...                                 + + De novo sequence peptides from tandem mass spectra.                             + PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which to sequence  + peptides.                                                                       + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +--model-mFILE                      The model weights (.ckpt    +                                                   file). If not provided,     +                                                   Casanovo will try to        +                                                   download the latest         +                                                   release (during             +                                                   sequencing).                +--output-oFILE                      The root file name to       +                                                   which results (i.e. the     +                                                   mzTab file during           +                                                   sequencing, as well as the  +                                                   log file during all modes)  +                                                   will be written. If not     +                                                   specified, a default        +                                                   timestamped file name will  +                                                   be used.                    +--config-cFILE                      The YAML configuration      +                                                   file overriding the         +                                                   default options.            +--verbosity-v[debug|info|warning|erro  Set the verbosity of        +r]  console logging messages.   +                                                   Log files are always set    +                                                   to 'debug'.                 +--overwrite_output-d  Whether to overwrite        +                                                   sequencing output files     +                                                   (i.e. output .log and       +                                                   .mzTab files)               +--help-h  Show this message and       +                                                   exit.                       +╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index fccd4140..1424a076 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -1,4 +1,4 @@ - + - - + + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - - - - $ casanovo train --help - - Usage: casanovo train [OPTIONS] TRAIN_PEAK_PATH...                              - - Train a Casanovo model on your own data.                                        - TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  - by MassIVE-KB, from which to train a new Casnovo model.                         - -╭─ Arguments ──────────────────────────────────────────────────────────────────╮ -│ *  TRAIN_PEAK_PATH    FILE  [required]                                       │ -╰──────────────────────────────────────────────────────────────────────────────╯ -╭─ Options ────────────────────────────────────────────────────────────────────╮ -│ *  --validation_peak_pa…  -p  FILE                    An annotated MGF file  │ -│                                                       for validation, like   │ -│                                                       from MassIVE-KB. Use   │ -│                                                       this option multiple   │ -│                                                       times to specify       │ -│                                                       multiple files.        │ -│                                                       [required]             │ -│    --model                -m  FILE                    The model weights      │ -│                                                       (.ckpt file). If not   │ -│                                                       provided, Casanovo     │ -│                                                       will try to download   │ -│                                                       the latest release.    │ -│    --output               -o  FILE                    The mzTab file to      │ -│                                                       which results will be  │ -│                                                       written.               │ -│    --config               -c  FILE                    The YAML configuration │ -│                                                       file overriding the    │ -│                                                       default options.       │ -│    --verbosity            -v  [debug|info|warning|er  Set the verbosity of   │ -│                               ror]                    console logging        │ -│                                                       messages. Log files    │ -│                                                       are always set to      │ -│                                                       'debug'.               │ -│    --help                 -h                          Show this message and  │ -│                                                       exit.                  │ -╰──────────────────────────────────────────────────────────────────────────────╯ - + + + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         + +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +*--validation_peak_pa…-pFILE                    An annotated MGF file   +                                                       for validation, like    +                                                       from MassIVE-KB. Use    +                                                       this option multiple    +                                                       times to specify        +                                                       multiple files.         +[required]             +--root_ckpt_name-rTEXT                    Root name for all       +                                                       model checkpoints       +                                                       saved during training,  +                                                       i.e. if root is         +                                                       specified as            +--root_ckpt_name foo +                                                       than all saved          +                                                       checkpoint filenames    +                                                       will be formatted as    +foo.epoch=2-step=1500… +                                                       If root is not          +                                                       specified the           +                                                       checkpoint filenames    +                                                       will instead be         +                                                       formatted as            +epoch=2-step=150000.c… +--model-mFILE                    The model weights       +                                                       (.ckpt file). If not    +                                                       provided, Casanovo      +                                                       will try to download    +                                                       the latest release      +                                                       (during sequencing).    +--output-oFILE                    The root file name to   +                                                       which results (i.e.     +                                                       the mzTab file during   +                                                       sequencing, as well as  +                                                       the log file during     +                                                       all modes) will be      +                                                       written. If not         +                                                       specified, a default    +                                                       timestamped file name   +                                                       will be used.           +--config-cFILE                    The YAML configuration  +                                                       file overriding the     +                                                       default options.        +--verbosity-v[debug|info|warning|er  Set the verbosity of    +ror]  console logging         +                                                       messages. Log files     +                                                       are always set to       +                                                       'debug'.                +--overwrite_output-d  Whether to overwrite    +                                                       sequencing output       +                                                       files (i.e. output      +                                                       .log and .mzTab files)  +--help-h  Show this message and   +                                                       exit.                   +╰──────────────────────────────────────────────────────────────────────────────╯ +