From 141f344fe92c6cc6fc70dc5562337f341bb433cf Mon Sep 17 00:00:00 2001
From: jfnavarro
Date: Sun, 12 Jan 2025 23:22:42 +0100
Subject: [PATCH] Almost all tests working

---
 AUTHORS.md                                  |   3 -
 CHANGELOG.md                                |   3 +
 README.md                                   |  54 +++++----
 README_SHORT.md                             |  42 -------
 data/README.md                              |   2 +
 docsrc/changes.rst                          |   5 +-
 docsrc/example.rst                          |   2 +-
 docsrc/installation.rst                     |   8 +-
 docsrc/license.rst                          |   2 +-
 docsrc/manual.rst                           |   2 +-
 pyproject.toml                              |  20 ++--
 stpipeline/common/filter.py                 |   2 +-
 stpipeline/common/utils.py                  |  34 ++----
 stpipeline/core/pipeline.py                 |   6 +-
 {scripts => stpipeline/scripts}/__init__.py |   0
 .../scripts}/adjust_matrix_coordinates.py   |  34 +++---
 .../scripts}/convertEnsemblToNames.py       |  18 +--
 .../scripts}/filter_gene_type_matrix.py     |  44 +++----
 .../scripts}/merge_fastq.py                 |  30 ++---
 {scripts => stpipeline/scripts}/multi_qa.py |  28 +++--
 .../scripts}/st_pipeline_run.py             |   0
 {scripts => stpipeline/scripts}/st_qa.py    |  14 ++-
 tests/filter_test.py                        |   4 +-
 tests/integration_test.py                   | 110 ++++++++----------
 tests/utils_test.py                         |   5 +-
 25 files changed, 215 insertions(+), 257 deletions(-)
 delete mode 100644 README_SHORT.md
 rename {scripts => stpipeline/scripts}/__init__.py (100%)
 rename {scripts => stpipeline/scripts}/adjust_matrix_coordinates.py (93%)
 rename {scripts => stpipeline/scripts}/convertEnsemblToNames.py (95%)
 rename {scripts => stpipeline/scripts}/filter_gene_type_matrix.py (91%)
 rename {scripts => stpipeline/scripts}/merge_fastq.py (95%)
 rename {scripts => stpipeline/scripts}/multi_qa.py (97%)
 rename {scripts => stpipeline/scripts}/st_pipeline_run.py (100%)
 rename {scripts => stpipeline/scripts}/st_qa.py (98%)

diff --git a/AUTHORS.md b/AUTHORS.md
index 87335f5..b346ebe 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -1,5 +1,2 @@
 ## Author:
 - Jose Fernandez Navarro
-
-## Contributors:
-- Erik Borgström
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a9ef7d3..9ce96f2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,9 @@
 * Added Docker container
 * Added tox
 * Updated versions of dependencies
+* Performed code optimizations
+* Added tests for full coverage
+* Bumped taggd to 0.4.0
 
 ## Version 1.8.2
 * Added annotation (htseq) feature type as parameter
diff --git a/README.md b/README.md
index 01329ef..0884c98 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,14 @@
 # Spatial Transcriptomics Pipeline
 
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/)
 [![Python 3.10](https://img.shields.io/badge/python-3.10-blue.svg)](https://www.python.org/downloads/release/python-310/)
 [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/release/python-311/)
+[![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-312/)
 [![PyPI version](https://badge.fury.io/py/stpipeline.svg)](https://badge.fury.io/py/stpipeline)
 [![Build Status](https://github.com/jfnavarro/st_pipeline/actions/workflows/dev.yml/badge.svg)](https://github.com/jfnavarro/st_pipeline/actions/workflows/dev)
 
 The ST Pipeline contains the tools and scripts needed to process and analyze the raw
-files generated with Spatial Transcriptomics and Visium raw data in FASTQ format to generate datasets for down-stream analysis.
+files generated with Spatial Transcriptomics and Visium in FASTQ format to generate datasets for downstream analysis.
 
 The ST pipeline can also be used to process single cell RNA-seq data as long as a file with barcodes
 identifying each cell is provided (same template as the files in the folder "ids").
@@ -22,7 +22,7 @@ The following files/parameters are commonly required :
 - FASTQ files (Read 1 containing the spatial information and the UMI and read 2 containing the genomic sequence)
 - A genome index generated with STAR
 - An annotation file in GTF or GFF3 format (optional when using a transcriptome)
-- The file containing the barcodes and array coordinates (look at the folder "ids" to use it as a reference).
+- A file containing the barcodes and array coordinates (look at the folder "ids" to use it as a reference).
 Basically this file contains 3 columns (BARCODE, X and Y), so if you provide this
 file with barcodes identifying cells (for example), the ST pipeline can be used for single cell data.
 This file is also optional if the data is not barcoded (for example RNA-Seq data).
@@ -30,7 +30,7 @@ The ST pipeline has multiple parameters mostly related to trimming, mapping and annotation
 but generally the default values are good enough. You can see a full
-description of the parameters typing "st_pipeline_run.py --help" after you have installed the ST pipeline.
+description of the parameters typing `st_pipeline_run --help` after you have installed the ST pipeline.
 
 The input FASTQ files can be given in gzip/bzip format as well.
 
@@ -57,9 +57,9 @@ The ST pipeline will also output a log file with useful stats and information.
 
 ## Installation
 
-We recommend you install a virtual environment like Pyenv or Anaconda before you install the pipeline.
+We recommend you create a virtual environment with `pyenv` or `Anaconda` before you install the pipeline.
 
-The ST Pipeline works with python 3.9, 3.10 and 3.11.
+The ST Pipeline works with Python 3.10, 3.11 and 3.12.
 
 You can install the ST Pipeline using PyPy:
 
@@ -90,14 +90,20 @@ cd stpipeline
 
 To install the pipeline type:
 
 ```bash
-python setup.py build
-python setup.py install
+pip install .
+```
+
+You can also use `poetry` to install the pipeline:
+
+```bash
+poetry build
+poetry install
 ```
 
 To see the different options type:
 
 ```bash
-st_pipeline_run.py --help
+st_pipeline_run --help
 ```
 
 ## Testing
@@ -105,8 +111,13 @@ st_pipeline_run.py --help
 To run a test type (make sure you are inside the source folder):
 
 ```bash
-python setup.py test
-python -m unittest testrun.py
+pytest
+```
+
+or
+
+```bash
+poetry run pytest
 ```
 
 ## Requirements
@@ -120,13 +131,6 @@ If you use anaconda you can install STAR with:
 conda install -c bioconda star
 ```
 
-The ST Pipeline requires `samtools` installed in the system
-If you use anaconda you can install samtools with:
-
-```bash
-conda install -c bioconda samtools openssl=1.0
-```
-
 The ST Pipeline needs a computer with at least 32GB of RAM (depending on the size of the genome) and 8 cpu cores.
 
 ## Dependencies
@@ -140,7 +144,7 @@ You can see them in the file `requirements.txt`
 An example run would be:
 
 ```bash
-st_pipeline_run.py --expName test --ids ids_file.txt \
+st_pipeline_run --expName test --ids ids_file.txt \
 --ref-map path_to_index --log-file log_file.txt --output-folder /home/me/results \
 --ref-annotation annotation_file.gtf file1.fastq file2.fastq
 ```
@@ -164,7 +168,7 @@ the output file so it contains gene ids/names instead of Ensembl ids.
 You can use this tool that comes with the ST Pipeline
 
 ```bash
-convertEnsemblToNames.py --annotation path_to_annotation_file --output st_data_updated.tsv st_data.tsv
+convertEnsemblToNames --annotation path_to_annotation_file --output st_data_updated.tsv st_data.tsv
 ```
 
 ## Merge demultiplexed FASTQ files
@@ -173,7 +177,7 @@ If you used different indexes to sequence and need to merge the files you can
 use the script `merge_fastq.py` that comes with the ST Pipeline
 
 ```bash
-merge_fastq.py --run-path path_to_run_folder --out-path path_to_output --identifiers S1 S2 S3 S4
+merge_fastq --run-path path_to_run_folder --out-path path_to_output --identifiers S1 S2 S3 S4
 ```
 
 Where `--identifiers` will be strings that identify each demultiplexed sample.
@@ -185,7 +189,7 @@ to certain gene types (For instance to keep only protein_coding).
 You can do so with the script `filter_gene_type_matrix.py` that comes with the ST Pipeline
 
 ```bash
-filter_gene_type_matrix.py --gene-types-keep protein-coding --annotation path_to_annotation_file stdata.tsv
+filter_gene_type_matrix --gene-types-keep protein_coding --annotation path_to_annotation_file stdata.tsv
 ```
 
 You may include the parameter `--ensembl-ids` if your genes are represented as emsembl ids instead.
@@ -197,7 +201,7 @@ to keep only spots inside the tissue. You can do so with the script `adjust_matr
 that comes with the ST Pipeline
 
 ```bash
-adjust_matrix_coordinates.py --outfile new_stdata.tsv --coordinates-file coordinates.txt stdata.tsv
+adjust_matrix_coordinates --outfile new_stdata.tsv --coordinates-file coordinates.txt stdata.tsv
 ```
 
 Where `coordinates.txt` will be a tab delimited file with 6 columns:
@@ -215,13 +219,13 @@ The ST Pipeline generate useful stats/QC information in the LOG file but if you
 want to obtain more detailed information about the quality of the data, you can run the following script:
 
 ```bash
-st_qa.py stdata.tsv
+st_qa stdata.tsv
 ```
 
 If you want to perform quality stats on multiple datasets you can run:
 
 ```bash
-multi_qa.py stdata1.tsv stadata2.tsv stdata3.tsv stdata4.tsv
+multi_qa stdata1.tsv stdata2.tsv stdata3.tsv stdata4.tsv
 ```
 
 Multi_qa.py generates violing plots, correlation plots/tables and more useful information and
diff --git a/README_SHORT.md b/README_SHORT.md
deleted file mode 100644
index dbc934b..0000000
--- a/README_SHORT.md
+++ /dev/null
@@ -1,42 +0,0 @@
-The ST Pipeline has been optimized for speed, robustness and it is very easy
-to use with many parameters to adjust all the settings.
-The ST Pipeline is fully parallel and has constant memory use.
-The ST Pipeline allows to skip any of the steps and multiple customization options.
-The ST Pipeline allows to use the genome or the transcriptome as reference.
-
-The following files/parameters are commonly required :
-- FASTQ files (Read 1 containing the spatial information and the UMI and read 2 containing the genomic sequence)
-- A genome index generated with STAR
-- An annotation file in GTF or GFF3 format (optional when using a transcriptome)
-- The file containing the barcodes and array coordinates (look at the folder "ids" to use it as a reference).
-Basically this file contains 3 columns (BARCODE, X and Y), so if you provide this
-file with barcodes identifying cells (for example), the ST pipeline can be used for single cell data.
-This file is also optional if the data is not barcoded (for example RNA-Seq data).
-- A name for the dataset
-
-The ST pipeline has multiple parameters mostly related to trimming, mapping and annotation
-but generally the default values are good enough. You can see a full
-description of the parameters typing "st_pipeline_run.py --help" after you have installed the ST pipeline.
-
-The input FASTQ files can be given in gzip/bzip format as well.
-
-Basically what the ST pipeline does (default mode) is :
-- Quality trimming (read 1 and read 2):
-  - Remove low quality bases
-  - Sanity check (reads same length, reads order, etc..)
-  - Check quality UMI
-  - Remove artifacts (PolyT, PolyA, PolyG, PolyN and PolyC) of user defined length
-  - Check for AT and GC content
-  - Discard reads with a minimum number of bases of that failed any of the checks above
-- Contamimant filter e.x. rRNA genome (Optional)
-- Mapping with STAR (only read 2)
-- Demultiplexing with [Taggd](https://github.com/jfnavarro/taggd) (only read 1)
-- Keep reads (read 2) that contain a valid barcode and are correctly mapped
-- Annotate the reads with htseq-count (slightly modified version)
-- Group annotated reads by barcode (spot position), gene and genomic location (with an offset) to get a read count
-- In the grouping/counting only unique molecules (UMIs) are kept.
-
-You can see a graphical more detailed description of the workflow in the documents `workflow.pdf` and `workflow_extended.pdf`
-
-The output dataset is a matrix of counts (genes as columns, spots as rows) in TSV format.
-The ST pipeline will also output a log file with useful stats and information.
diff --git a/data/README.md b/data/README.md
index 7275525..e2f41eb 100644
--- a/data/README.md
+++ b/data/README.md
@@ -1,3 +1,5 @@
+# Test dataset
+
 These dataset were generated from the publicly available raw FASTQ files
 of the Mouse Olfatory Bulb
 Replicates number 4 and 9
diff --git a/docsrc/changes.rst b/docsrc/changes.rst
index 1f63a72..8b7f911 100644
--- a/docsrc/changes.rst
+++ b/docsrc/changes.rst
@@ -6,10 +6,13 @@
 * Refactor code to modern Python (black, mypy, isort)
 * Added Github Actions
 * Added pre-commit hooks
-* Added Docker container
 * Change the build configuration to Poetry
+* Added Docker container
 * Added tox
 * Updated versions of dependencies
+* Performed code optimizations
+* Added tests for full coverage
+* Bumped taggd to 0.4.0
 
 **Version 1.8.2**
 * Added annotation (htseq) feature type as parameter
diff --git a/docsrc/example.rst b/docsrc/example.rst
index b33f429..b3b9d43 100644
--- a/docsrc/example.rst
+++ b/docsrc/example.rst
@@ -25,7 +25,7 @@ The following is an example of an BASH file to run the ST pipeline.
     EXP=YOUR_EXP_NAME
 
     # Running the pipeline
-    st_pipeline_run.py \
+    st_pipeline_run \
       --output-folder $OUTPUT \
       --ids $ID \
       --ref-map $MAP \
diff --git a/docsrc/installation.rst b/docsrc/installation.rst
index fa6a7a4..454d948 100644
--- a/docsrc/installation.rst
+++ b/docsrc/installation.rst
@@ -10,7 +10,7 @@ We recommend to download and install Anaconda (https://www.anaconda.com/products
 We then create a virtual environment from which we will run the pipeline in.
 Type the following command:
 
-    ``conda create -n stpipeline python=3.9``
+    ``conda create -n stpipeline python=3.10``
 
 The name for the virtual environment that we have just created is specified by the -n flag.
 Here is is called stpipeline, but this can be anything that you want
@@ -46,9 +46,11 @@ Activate the virtual environment (if not already active)
 
 Install the pipeline
 
-   ``python setup.py build``
+   ``pip install .``
 
-   ``python setup.py install``
+or using poetry:
+
+   ``poetry install``
 
 Alternatively, you can simply install the pipeline using PyPy:
diff --git a/docsrc/license.rst b/docsrc/license.rst
index 314cb35..3a023dd 100644
--- a/docsrc/license.rst
+++ b/docsrc/license.rst
@@ -2,7 +2,7 @@ License
 -------
 
 The MIT License (MIT)
-Copyright (c) 2020 Jose Fernandez Navarro.
+Copyright (c) 2024 Jose Fernandez Navarro.
 All rights reserved.
 
 * Jose Fernandez Navarro
diff --git a/docsrc/manual.rst b/docsrc/manual.rst
index c5084c8..21dda8d 100644
--- a/docsrc/manual.rst
+++ b/docsrc/manual.rst
@@ -13,7 +13,7 @@ the path to a STAR genome index, the path to a annotation file in GTF
 format and a dataset name.
 
 The ST Pipeline has many parameters, you can see a description of them
-by typing : st_pipeline_run.py --help
+by typing: st_pipeline_run --help
 
 Note that the minimum read length is dependant on the type of kit used,
 and should be adjusted accordingly, i.e. a 150bp kit should have a different
diff --git a/pyproject.toml b/pyproject.toml
index d178df2..746c492 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ version = "2.0.0"
 description = "ST Pipeline: An automated pipeline for spatial mapping of unique transcripts"
 authors = ["Jose Fernandez Navarro "]
 license = "MIT"
-readme = "README_SHORT.md"
+readme = "README.md"
 keywords = ["visium", "analysis", "pipeline", "spatial", "transcriptomics", "toolkit"]
 repository = "https://github.com/jfnavarro/st_pipeline"
 classifiers = [
@@ -22,10 +22,8 @@ classifiers = [
 ]
 
 include = [
     { path = "README.md" },
-    { path = "README_SHORT.md" },
     { path = "LICENSE" },
-    { path = "doc/**" },
-    { path = "scripts/**" }
+    { path = "doc/**" }
 ]
@@ -47,13 +45,13 @@ dnaio = "^1.2.3"
 distance = "^0.1.3"
 
 [tool.poetry.scripts]
-st_qa = "scripts.st_qa:main"
-st_pipeline_run = "scripts.st_pipeline_run:main"
-multi_qa = "scripts.multi_qa:main"
-merge_fastq = "scripts.merge_fastq:main"
-filter_gene_type_matrix = "scripts.filter_gene_type_matrix:main"
-convertEnsemblToNames = "scripts.convertEnsemblToNames:main"
-adjust_matrix_coordinates = "scripts.adjust_matrix_coordinates:main"
+st_qa = "stpipeline.scripts.st_qa:main"
+st_pipeline_run = "stpipeline.scripts.st_pipeline_run:main"
+multi_qa = "stpipeline.scripts.multi_qa:main"
+merge_fastq = "stpipeline.scripts.merge_fastq:main"
+filter_gene_type_matrix = "stpipeline.scripts.filter_gene_type_matrix:main"
+convertEnsemblToNames = "stpipeline.scripts.convertEnsemblToNames:main"
+adjust_matrix_coordinates = "stpipeline.scripts.adjust_matrix_coordinates:main"
 
 [tool.poetry.extras]
 test = [
diff --git a/stpipeline/common/filter.py b/stpipeline/common/filter.py
index ed3d294..e1b90d3 100644
--- a/stpipeline/common/filter.py
+++ b/stpipeline/common/filter.py
@@ -110,7 +110,7 @@ def filter_input_data(
     dropped_adaptor = 0
     too_short_after_trimming = 0
 
-    bam_file = pysam.AlignmentFile(out_file, "wb")
+    bam_file = pysam.AlignmentFile(out_file, "wb", header=bam_header)
     if keep_discarded_files:
         out_writer_discarded = dnaio.open(out_file_discarded, mode="w")  # type: ignore
diff --git a/stpipeline/common/utils.py b/stpipeline/common/utils.py
index 464d74c..1619434 100755
--- a/stpipeline/common/utils.py
+++ b/stpipeline/common/utils.py
@@ -5,39 +5,21 @@
 from datetime import datetime
 import os
 import subprocess
-from typing import Optional, Generator, IO, Any
+from typing import Optional, IO, Any
+import shutil
 
 
-def which_program(program: str) -> Optional[str]:
+def which_program(program: str) -> bool:
     """
-    Checks if a program exists and is executable.
+    Check if a program is installed and available in the system's PATH.
 
     Args:
-        program: The program name.
+        program: The name of the program to check.
 
     Returns:
-        The full path to the program if found, otherwise None.
-    """
-
-    def is_exe(fpath: str) -> bool:
-        return fpath is not None and os.path.exists(fpath) and os.access(fpath, os.X_OK)
-
-    def ext_candidates(fpath: str) -> Generator[str, None, None]:
-        yield fpath
-        for ext in os.environ.get("PATHEXT", "").split(os.pathsep):
-            yield fpath + ext
-
-    fpath, _ = os.path.split(program)
-    if fpath:
-        if is_exe(program):
-            return program
-    else:
-        for path in os.environ["PATH"].split(os.pathsep):
-            exe_file = os.path.join(path, program)
-            for candidate in ext_candidates(exe_file):
-                if is_exe(candidate):
-                    return candidate
-    return None
+        True if the program is found and executable, False otherwise.
+    """
+    return shutil.which(program) is not None
 
 
 class TimeStamper:
diff --git a/stpipeline/core/pipeline.py b/stpipeline/core/pipeline.py
index 4ae1db6..9971f77 100755
--- a/stpipeline/core/pipeline.py
+++ b/stpipeline/core/pipeline.py
@@ -283,13 +283,13 @@ def sanityCheck(self) -> None:
             raise RuntimeError(error)
 
         # Test the presence of the required tools
-        required_scripts = set("STAR")
+        required_scripts = ["STAR"]
         unavailable_scripts = set()
         for script in required_scripts:
-            if which_program(script) is None:
+            if not which_program(script):
                 unavailable_scripts.add(script)
         if len(unavailable_scripts) != 0:
-            error = "Error starting the pipeline.\n" "Required software not found:\t".join(unavailable_scripts)
+            error = f"Error starting the pipeline. Required software not found: {' '.join(unavailable_scripts)}"
             logger.error(error)
             raise RuntimeError(error)
 
diff --git a/scripts/__init__.py b/stpipeline/scripts/__init__.py
similarity index 100%
rename from scripts/__init__.py
rename to stpipeline/scripts/__init__.py
diff --git a/scripts/adjust_matrix_coordinates.py b/stpipeline/scripts/adjust_matrix_coordinates.py
similarity index 93%
rename from scripts/adjust_matrix_coordinates.py
rename to stpipeline/scripts/adjust_matrix_coordinates.py
index 43e16ff..68d7277 100644
--- a/scripts/adjust_matrix_coordinates.py
+++ b/stpipeline/scripts/adjust_matrix_coordinates.py
@@ -34,7 +34,24 @@
 import pandas as pd
 
 
-def main(counts_matrix: str, coordinates_file: str, update_coordinates: bool, outfile: str) -> int:
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format")
+    parser.add_argument("--outfile", help="Name of the output file")
+    parser.add_argument(
+        "--update-coordinates",
+        action="store_true",
+        default=False,
+        help="Updates the spot coordinates in the output matrix with the\n"
+        "new coordinates present in the coordinates file",
+    )
+    parser.add_argument("--coordinates-file", required=True, help="New coordinates in a tab delimited file")
+    args = parser.parse_args()
+
+    sys.exit(run(args.counts_matrix, args.coordinates_file, args.update_coordinates, args.outfile))
+
+
+def run(counts_matrix: str, coordinates_file: str, update_coordinates: bool, outfile: str) -> int:
     if not os.path.isfile(counts_matrix) or not os.path.isfile(coordinates_file):
         print("Error, input file(s) not present or invalid format")
         return 1
@@ -87,17 +104,4 @@ def main(counts_matrix: str, coordinates_file: str, update_coordinates: bool, ou
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format")
-    parser.add_argument("--outfile", help="Name of the output file")
-    parser.add_argument(
-        "--update-coordinates",
-        action="store_true",
-        default=False,
-        help="Updates the spot coordinates in the output matrix with the\n"
-        "new coordinates present in the coordinates file",
-    )
-    parser.add_argument("--coordinates-file", required=True, help="New coordinates in a tab delimited file")
-    args = parser.parse_args()
-
-    sys.exit(main(args.counts_matrix, args.coordinates_file, args.update_coordinates, args.outfile))
+    main()
diff --git a/scripts/convertEnsemblToNames.py b/stpipeline/scripts/convertEnsemblToNames.py
similarity index 95%
rename from scripts/convertEnsemblToNames.py
rename to stpipeline/scripts/convertEnsemblToNames.py
index 2b7f84f..dd24d63 100755
--- a/scripts/convertEnsemblToNames.py
+++ b/stpipeline/scripts/convertEnsemblToNames.py
@@ -18,7 +18,16 @@
 from collections import Counter
 
 
-def main(st_data_file: str, annotation: str, output_file: str) -> int:
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format")
+    parser.add_argument("--output", default="output.tsv", help="Name of the output file, default output.tsv")
+    parser.add_argument("--annotation", required=True, help="Path to the annotation file used to generate the data")
+    args = parser.parse_args()
+    sys.exit(run(args.counts_matrix, args.annotation, args.output))
+
+
+def run(st_data_file: str, annotation: str, output_file: str) -> int:
     if not os.path.isfile(st_data_file) or not os.path.isfile(annotation):
         print("Error, input file(s) not present or invalid format")
         return 1
@@ -80,9 +89,4 @@ def main(st_data_file: str, annotation: str, output_file: str) -> int:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format")
-    parser.add_argument("--output", default="output.tsv", help="Name of the output file, default output.tsv")
-    parser.add_argument("--annotation", required=True, help="Path to the annotation file used to generate the data")
-    args = parser.parse_args()
-    sys.exit(main(args.counts_matrix, args.annotation, args.output))
+    main()
diff --git a/scripts/filter_gene_type_matrix.py b/stpipeline/scripts/filter_gene_type_matrix.py
similarity index 91%
rename from scripts/filter_gene_type_matrix.py
rename to stpipeline/scripts/filter_gene_type_matrix.py
index 554b557..c184c37 100644
--- a/scripts/filter_gene_type_matrix.py
+++ b/stpipeline/scripts/filter_gene_type_matrix.py
@@ -24,7 +24,29 @@
 from typing import List
 
 
-def main(counts_matrix: str, gene_types_keep: List[str], outfile: str, annotation: str, ensembl_ids: bool) -> int:
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format")
+    parser.add_argument("--outfile", help="Name of the output file")
+    parser.add_argument(
+        "--gene-types-keep",
+        required=True,
+        nargs="+",
+        type=str,
+        help="List of Ensembl gene types to keep (e.g. protein_coding lincRNA)",
+    )
+    parser.add_argument("--annotation", help="The Ensembl annotation file", required=True, type=str)
+    parser.add_argument(
+        "--ensembl-ids",
+        action="store_true",
+        default=False,
+        help="Pass this parameter if the genes in the matrix " "are named with Ensembl Ids instead of gene names",
+    )
+    args = parser.parse_args()
+    sys.exit(run(args.counts_matrix, args.gene_types_keep, args.outfile, args.annotation, args.ensembl_ids))
+
+
+def run(counts_matrix: str, gene_types_keep: List[str], outfile: str, annotation: str, ensembl_ids: bool) -> int:
     if not os.path.isfile(counts_matrix) or not os.path.isfile(annotation):
         print("Error, input file(s) not present or invalid format")
         return 1
@@ -66,22 +88,4 @@ def main(counts_matrix: str, gene_types_keep: List[str], outfile: str, annotatio
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format")
-    parser.add_argument("--outfile", help="Name of the output file")
-    parser.add_argument(
-        "--gene-types-keep",
-        required=True,
-        nargs="+",
-        type=str,
-        help="List of Ensembl gene types to keep (E.x protein_coding lincRNA",
-    )
-    parser.add_argument("--annotation", help="The Ensembl annotation file", required=True, type=str)
-    parser.add_argument(
-        "--ensembl-ids",
-        action="store_true",
-        default=False,
-        help="Pass this parameter if the genes in the matrix " "are named with Ensembl Ids instead of gene names",
-    )
-    args = parser.parse_args()
-    sys.exit(main(args.counts_matrix, args.gene_types_keep, args.outfile, args.annotation, args.ensembl_ids))
+    main()
diff --git a/scripts/merge_fastq.py b/stpipeline/scripts/merge_fastq.py
similarity index 95%
rename from scripts/merge_fastq.py
rename to stpipeline/scripts/merge_fastq.py
index 579f57d..19e1a52 100644
--- a/scripts/merge_fastq.py
+++ b/stpipeline/scripts/merge_fastq.py
@@ -37,7 +37,22 @@ def run_command(command: List[str], out: Union[int, IO[bytes]] = subprocess.PIPE
         raise e
 
 
-def main(run_path: str, indexes: List[str], out_path: str) -> int:
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--run-path", required=True, help="Path to the run folder")
+    parser.add_argument("--out-path", required=True, help="Path to the output folder")
+    parser.add_argument(
+        "--identifiers",
+        required=True,
+        nargs="+",
+        type=str,
+        help="List of identifiers for each sample (e.g. S1 S2 S3 S4)",
+    )
+    args = parser.parse_args()
+    sys.exit(run(args.run_path, args.identifiers, args.out_path))
+
+
+def run(run_path: str, indexes: List[str], out_path: str) -> int:
     if not os.path.isdir(run_path) or not os.path.isdir(out_path):
         print("Error, either run_path or out_path folders do not exist")
         return 1
@@ -82,15 +97,4 @@ def main(run_path: str, indexes: List[str], out_path: str) -> int:
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument("--run-path", required=True, help="Path to the run folder")
-    parser.add_argument("--out-path", required=True, help="Path to the output folder")
-    parser.add_argument(
-        "--identifiers",
-        required=True,
-        nargs="+",
-        type=str,
S1 S2 S3 S4)", - ) - args = parser.parse_args() - sys.exit(main(args.run_path, args.identifiers, args.out_path)) + main() diff --git a/scripts/multi_qa.py b/stpipeline/scripts/multi_qa.py similarity index 97% rename from scripts/multi_qa.py rename to stpipeline/scripts/multi_qa.py index e12dc02..a7503ef 100644 --- a/scripts/multi_qa.py +++ b/stpipeline/scripts/multi_qa.py @@ -103,7 +103,21 @@ def create_pca_plot(data: Any, labels: List[str], title: str, outfile: str) -> N fig.savefig(outfile, format="pdf", dpi=90) -def main(counts_table_files: List[str], outdir: str, use_log: bool) -> int: +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + "counts_matrix_files", nargs="+", help="One or more matrices with gene counts (genes as columns) in TSV format" + ) + parser.add_argument("--outdir", default=None, help="Path to the output directory") + parser.add_argument( + "--use-log-scale", action="store_true", default=False, help="Convert counts to log space for the correlation" + ) + args = parser.parse_args() + + sys.exit(run(args.counts_matrix_files, args.outdir, args.use_log_scale)) + + +def run(counts_table_files: List[str], outdir: str, use_log: bool) -> int: if len(counts_table_files) == 0 or any(not os.path.isfile(f) for f in counts_table_files): print("Error, input file(s) not present or invalid format") return 1 @@ -206,14 +220,4 @@ def main(counts_table_files: List[str], outdir: str, use_log: bool) -> int: if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument( - "counts_matrix_files", nargs="+", help="One or more matrices with gene counts (genes as columns) in TSV format" - ) - parser.add_argument("--outdir", default=None, help="Path to the output directory") - parser.add_argument( - "--use-log-scale", action="store_true", default=False, help="Convert counts to log space for the correlation" - ) - args = parser.parse_args() - - sys.exit(main(args.counts_matrix_files, args.outdir, args.use_log_scale)) + main() diff --git a/scripts/st_pipeline_run.py b/stpipeline/scripts/st_pipeline_run.py similarity index 100% rename from scripts/st_pipeline_run.py rename to stpipeline/scripts/st_pipeline_run.py diff --git a/scripts/st_qa.py b/stpipeline/scripts/st_qa.py similarity index 98% rename from scripts/st_qa.py rename to stpipeline/scripts/st_qa.py index ab49967..5602105 100644 --- a/scripts/st_qa.py +++ b/stpipeline/scripts/st_qa.py @@ -100,7 +100,14 @@ def histogram( raise RuntimeError("Failed to create histogram") from e -def main(input_data: str) -> int: +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format") + args = parser.parse_args() + sys.exit(run(args.counts_matrix)) + + +def run(input_data: str) -> int: # Parse the data counts_table = pd.read_table(input_data, sep="\t", header=0, index_col=0) @@ -268,7 +275,4 @@ def main(input_data: str) -> int: if __name__ == "__main__": - parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument("counts_matrix", help="Matrix with gene counts (genes as columns) in TSV format") - args = parser.parse_args() - sys.exit(main(args.counts_matrix)) + main() diff --git a/tests/filter_test.py b/tests/filter_test.py index 
index 22ea579..b49caff 100644
--- a/tests/filter_test.py
+++ b/tests/filter_test.py
@@ -5,7 +5,7 @@
 import pytest
 import dnaio
 from unittest.mock import Mock, patch
-from stpipeline.common.filter import filter_input_data
+from stpipeline.common.filter import filter_input_data, bam_header
 
 
 def generate_test_fastq(filepath, records):
@@ -86,4 +86,4 @@ def test_filter_input_data(mock_alignment_file, setup_fastq_files, tmp_path):
     assert total_reads == 6
     assert remaining_reads < total_reads
 
-    mock_alignment_file.assert_called_once_with(str(out_file), "wb")
+    mock_alignment_file.assert_called_once_with(str(out_file), "wb", header=bam_header)
diff --git a/tests/integration_test.py b/tests/integration_test.py
index 19c84cf..963efd2 100755
--- a/tests/integration_test.py
+++ b/tests/integration_test.py
@@ -103,6 +103,7 @@ def setup_pipeline():
     rmtree(outdir, ignore_errors=True)
 
 
+@pytest.fixture
 def test_pipeline_run(setup_pipeline):
     """Test the full pipeline run."""
     data = setup_pipeline
@@ -110,7 +111,7 @@ def test_pipeline_run(setup_pipeline):
     try:
         check_call(
             [
-                "st_pipeline_run.py",
+                "st_pipeline_run",
                 "--verbose",
                 "--no-clean-up",
                 "--star-two-pass-mode",
@@ -139,7 +140,7 @@ def test_pipeline_run(setup_pipeline):
             ]
         )
     except Exception as e:
-        pytest.fail(f"st_pipeline_run.py execution failed: {e}")
+        pytest.fail(f"st_pipeline_run execution failed: {e}")
 
     # Verify output files
     datafile = os.path.join(data["outdir"], "test_stdata.tsv")
@@ -150,17 +151,17 @@ def test_pipeline_run(setup_pipeline):
     assert os.path.exists(readsfile)
     assert os.path.getsize(readsfile) > 1024
     assert os.path.exists(statsfile)
+    return datafile
 
 
-def test_st_qa(setup_pipeline):
-    """Test the st_qa.py."""
-    data = setup_pipeline
-    datafile = os.path.join(data["outdir"], "test_stdata.tsv")
+def test_st_qa(test_pipeline_run):
+    """Test the st_qa."""
+    datafile = test_pipeline_run
 
     try:
-        check_call(["st_qa.py", datafile])
+        check_call(["st_qa", datafile])
     except Exception as e:
-        pytest.fail(f"st_qa.py execution failed: {e}")
+        pytest.fail(f"st_qa execution failed: {e}")
 
     clean_name = os.path.basename(datafile).split(".")[0]
     assert os.path.exists(f"{clean_name}_qa_stats.txt")
@@ -178,15 +179,14 @@ def test_st_qa(setup_pipeline):
     assert os.path.exists(f"{clean_name}_hist_spots_gene_2.pdf")
 
 
-def test_multi_qa(setup_pipeline):
-    """Test the multi_qa.py."""
-    data = setup_pipeline
-    datafile = os.path.join(data["outdir"], "test_stdata.tsv")
+def test_multi_qa(test_pipeline_run):
+    """Test the multi_qa."""
+    datafile = test_pipeline_run
 
     try:
-        check_call(["multi_qa.py", datafile, datafile, datafile, datafile])
+        check_call(["multi_qa", datafile, datafile, datafile, datafile])
     except Exception as e:
-        pytest.fail(f"multi_qa.py execution failed: {e}")
+        pytest.fail(f"multi_qa execution failed: {e}")
 
     assert os.path.exists("violin_plot_reads.pdf")
     assert os.path.exists("violin_plot_genes.pdf")
@@ -196,22 +196,6 @@ def test_multi_qa(setup_pipeline):
     assert os.path.exists("pca.pdf")
 
 
-def test_adjust_matrix_coordinates_help():
-    """Test for adjust_matrix_coordinates.py"""
-    try:
-        check_call(["adjust_matrix_coordinates.py", "--help"])
-    except Exception as e:
-        pytest.fail(f"adjust_matrix_coordinates.py failed: {e}")
-
-
-def test_merge_fastq_help():
-    """Test for merge_fastq.py"""
-    try:
-        check_call(["merge_fastq.py", "--help"])
-    except Exception as e:
-        pytest.fail(f"merge_fastq.py failed: {e}")
-
-
 @pytest.fixture
 def setup_test_files(tmp_path):
     """Set up fake data for the test."""
@@ -242,11 +226,11 @@ def test_adjust_matrix_coordinates(setup_test_files, tmp_path):
     counts_matrix, coordinates_file, outfile = setup_test_files
 
-    # Call the adjust_matrix_coordinates.py script
+    # Call the adjust_matrix_coordinates script
     try:
         check_call(
             [
-                "adjust_matrix_coordinates.py",
+                "adjust_matrix_coordinates",
                 str(counts_matrix),
                 "--coordinates-file",
                 str(coordinates_file),
@@ -256,7 +240,7 @@ def test_adjust_matrix_coordinates(setup_test_files, tmp_path):
             ]
         )
     except Exception as e:
-        pytest.fail(f"adjust_matrix_coordinates.py execution failed: {e}")
+        pytest.fail(f"adjust_matrix_coordinates execution failed: {e}")
 
     # Verify the output file
     assert outfile.exists(), "Output file should be created"
@@ -302,10 +286,10 @@ def test_merge_fastq(setup_fastq_files, tmp_path):
     # Run the script
     try:
         check_call(
-            ["merge_fastq.py" "--run-path", str(run_path), "--out-path", str(out_path), "--identifiers", *identifiers]
+            ["merge_fastq", "--run-path", str(run_path), "--out-path", str(out_path), "--identifiers", *identifiers]
         )
    except Exception as e:
-        pytest.fail(f"merge_fastq.py execution failed: {e}")
+        pytest.fail(f"merge_fastq execution failed: {e}")
 
     # Verify that the merged files are created in the output directory
     for idx in identifiers:
@@ -318,7 +302,7 @@ def test_merge_fastq(setup_fastq_files, tmp_path):
 @pytest.fixture
 def setup_filter_gene_type_data(tmp_path):
     """
-    Fixture to create fake data for the filter_gene_type_matrix.py script.
+    Fixture to create fake data for the filter_gene_type_matrix script.
     """
     # Create a temporary counts matrix file
     counts_matrix = tmp_path / "counts_matrix.tsv"
@@ -332,13 +316,14 @@ def setup_filter_gene_type_data(tmp_path):
 
     # Create a temporary annotation file
     annotation_file = tmp_path / "annotation.gtf"
-    annotation_data = """\
-1 gene_id "gene1"; gene_name "Gene1"; gene_type "protein_coding";
-1 gene_id "gene2"; gene_name "Gene2"; gene_type "lincRNA";
-1 gene_id "gene3"; gene_name "Gene3"; gene_type "protein_coding";
-"""
+    content = (
+        "##gff-version 3\n"
+        "chr1\tsource\tfeature\t100\t200\t.\t+\t.\tgene_id=gene1;gene_name=gene1;gene_type=protein_coding\n"
+        "chr1\tsource\tfeature\t300\t400\t.\t-\t.\tgene_id=gene2;gene_name=gene2;gene_type=lincRNA\n"
+        "chr2\tsource\tfeature\t500\t600\t.\t+\t.\tgene_id=gene3;gene_name=gene3;gene_type=protein_coding\n"
+    )
     with open(annotation_file, "w") as f:
-        f.write(annotation_data)
+        f.write(content)
 
     # Output file path
     outfile = tmp_path / "filtered_counts_matrix.tsv"
@@ -353,7 +338,7 @@ def test_filter_gene_type_matrix(setup_filter_gene_type_data, tmp_path):
     try:
         check_call(
             [
-                "filter_gene_type_matrix.py",
+                "filter_gene_type_matrix",
                 str(counts_matrix),
                 "--annotation",
                 str(annotation_file),
@@ -364,22 +349,22 @@ def test_filter_gene_type_matrix(setup_filter_gene_type_data, tmp_path):
             ]
         )
     except Exception as e:
-        pytest.fail(f"filter_gene_type_matrix.py execution failed: {e}")
+        pytest.fail(f"filter_gene_type_matrix execution failed: {e}")
 
     # Validate the output file
     assert outfile.exists(), "Filtered counts matrix should exist"
     filtered_df = pd.read_csv(outfile, sep="\t", index_col=0)
 
     # Validate the filtered data
-    assert "gene1" in filtered_df.columns, "gene1 should be present in the filtered data"
-    assert "gene3" in filtered_df.columns, "gene3 should be present in the filtered data"
-    assert "gene2" not in filtered_df.columns, "gene2 should be filtered out"
+    assert "gene1" in filtered_df.columns.to_list(), "gene1 should be present in the filtered data"
present in the filtered data" + assert "gene2" not in filtered_df.columns.to_list(), "gene2 should be filtered out" @pytest.fixture def setup_convert_ensembl_data(tmp_path): """ - Fixture to create fake data for the convertEnsemblToNames.py script. + Fixture to create fake data for the convertEnsemblToNames script. """ # Create a temporary counts matrix file counts_matrix = tmp_path / "counts_matrix.tsv" @@ -393,13 +378,14 @@ def setup_convert_ensembl_data(tmp_path): # Create a temporary annotation file annotation_file = tmp_path / "annotation.gtf" - annotation_data = """\ -1 gene_id "ENSG000001"; gene_name "GeneA"; -1 gene_id "ENSG000002"; gene_name "GeneB"; -1 gene_id "ENSG000003"; gene_name "GeneC"; -""" + content = ( + "##gff-version 3\n" + "chr1\tsource\tfeature\t100\t200\t.\t+\t.\tgene_id=ENSG000001;gene_name=Gene1\n" + "chr1\tsource\tfeature\t300\t400\t.\t-\t.\tgene_id=ENSG000002;gene_name=Gene2\n" + "chr2\tsource\tfeature\t500\t600\t.\t+\t.\tgene_id=ENSG000003;gene_name=Gene3\n" + ) with open(annotation_file, "w") as f: - f.write(annotation_data) + f.write(content) # Output file path output_file = tmp_path / "output_counts_matrix.tsv" @@ -414,7 +400,7 @@ def test_convert_ensembl_to_names(setup_convert_ensembl_data): try: check_call( [ - "convertEnsemblToNames.py", + "convertEnsemblToNames", str(counts_matrix), "--annotation", str(annotation_file), @@ -423,16 +409,16 @@ def test_convert_ensembl_to_names(setup_convert_ensembl_data): ] ) except Exception as e: - pytest.fail(f"convertEnsemblToNames.py execution failed: {e}") + pytest.fail(f"convertEnsemblToNames execution failed: {e}") # Validate the output file assert output_file.exists(), "Converted counts matrix should exist" converted_df = pd.read_csv(output_file, sep="\t", index_col=0) # Validate the converted data - assert "GeneA" in converted_df.columns, "GeneA should be present in the converted data" - assert "GeneB" in converted_df.columns, "GeneB should be present in the converted data" - assert "GeneC" in converted_df.columns, "GeneC should be present in the converted data" - assert "ENSG000001" not in converted_df.columns, "ENSG000001 should not be present in the converted data" - assert "ENSG000002" not in converted_df.columns, "ENSG000002 should not be present in the converted data" - assert "ENSG000003" not in converted_df.columns, "ENSG000003 should not be present in the converted data" + assert "Gene1" in converted_df.columns.to_list(), "GeneA should be present in the converted data" + assert "Gene2" in converted_df.columns.to_list(), "GeneB should be present in the converted data" + assert "Gene3" in converted_df.columns.to_list(), "GeneC should be present in the converted data" + assert "ENSG000001" not in converted_df.columns.to_list(), "ENSG000001 should not be present in the converted data" + assert "ENSG000002" not in converted_df.columns.to_list(), "ENSG000002 should not be present in the converted data" + assert "ENSG000003" not in converted_df.columns.to_list(), "ENSG000003 should not be present in the converted data" diff --git a/tests/utils_test.py b/tests/utils_test.py index 1f33d5c..4d40a73 100644 --- a/tests/utils_test.py +++ b/tests/utils_test.py @@ -27,14 +27,13 @@ def temp_file(tmp_path): def test_which_program(): program = "python" result = which_program(program) - assert result is not None - assert os.path.basename(result) == program + assert result is True def test_which_program_not_found(): program = "nonexistent_program" result = which_program(program) - assert result is None + assert result is 
+    assert result is False
 
 
 def test_timestamper():