Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
jfnavarro committed Jan 15, 2025
1 parent 30a1cd6 commit b55375e
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 37 deletions.
4 changes: 2 additions & 2 deletions data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ the Mus Musculus Ensembl annotation version 86. The annotation file used Mus Mus
A contaminant genome STAR index was used generated from the Ensembl non coding RNA Mus musculus version 86.
The IDs file used to demultiplex were the 1000L2 and 1000L5.

The following settings were used:
The following settings were used (NOTE that the name of the parameters in the example are for version 1.3.1):

```bash
st_pipeline_run.py \
st_pipeline_run \
--output-folder OUTPUT \
--ids id.txt \
--ref-map path_to_genome_index \
Expand Down
2 changes: 2 additions & 0 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ st_pipeline_run [options] fastq_file_fw fastq_file_rv
having a barcode composed of multiple sequences in the
read or when the barcode needs to be trimmed out.
Trimming sequences can be given several times.
--demultiplexing-chunk-size [INT]
Chunk size for parallel processing (number of reads assigned to each thread) (default: 10000)
--htseq-mode [STRING]
Mode of annotation when using htseq-count. Modes =
{union, intersection-nonempty(default), intersection-
Expand Down
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions stpipeline/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,20 @@ def get_timestamp(self) -> float:
return ts


def timestemp_to_str(timestamp: float) -> str:
    """
    Format a time.time()-style timestamp as "minutes:seconds".

    Args:
        timestamp: The timestamp (in seconds) to convert, typically from time.time().

    Returns:
        A "minutes:seconds" string with seconds zero-padded to two digits.
    """
    # Truncate to whole seconds, then split into minutes and leftover seconds.
    total_seconds = int(timestamp)
    return f"{total_seconds // 60}:{total_seconds % 60:02d}"


def safe_remove(filename: Optional[str]) -> None:
"""
Safely removes a file if it exists.
Expand Down
75 changes: 41 additions & 34 deletions stpipeline/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
get_star_version,
get_taggd_count_version,
safe_remove,
timestemp_to_str,
which_program,
)
from stpipeline.core.annotation import annotateReads
Expand Down Expand Up @@ -912,18 +913,24 @@ def createLogger(self) -> None:
logger.info(f"Using contamination filter STAR index: {self.contaminant_index}")
logger.info(f"CPU Nodes: {self.threads}")

if not self.disable_barcode:
logger.info(f"Ids(barcodes) file: {self.ids}")
logger.info(f"TaggD allowed mismatches: {self.allowed_missed}")
logger.info(f"TaggD kmer size: {self.allowed_kmer}")
logger.info(f"TaggD overhang: {self.overhang}")
logger.info(f"TaggD metric: {self.taggd_metric}")
if self.taggd_multiple_hits_keep_one:
logger.info("TaggD multiple hits keep one (random) is enabled")
if self.taggd_trim_sequences is not None:
logger.info(f"TaggD trimming from the barcodes: {'-'.join(str(x) for x in self.taggd_trim_sequences)}")
logger.info(f"TaggD chunk size: {self.taggd_chunk_size }")
if not self.disable_trimming:
logger.info("Quality and trimming settings")
if self.remove_polyA_distance > 0:
logger.info(f"Removing polyA sequences of a length of at least: {self.remove_polyA_distance}")
if self.remove_polyT_distance > 0:
logger.info(f"Removing polyT sequences of a length of at least: {self.remove_polyT_distance}")
if self.remove_polyG_distance > 0:
logger.info(f"Removing polyG sequences of a length of at least: {self.remove_polyG_distance}")
if self.remove_polyC_distance > 0:
logger.info(f"Removing polyC sequences of a length of at least: {self.remove_polyC_distance}")
if self.remove_polyN_distance > 0:
logger.info(f"Removing polyN sequences of a length of at least: {self.remove_polyN_distance}")
logger.info(f"Allowing {self.adaptor_missmatches} mismatches when removing homopolymers")
logger.info(f"Remove reads whose AT content is {self.filter_AT_content}%")
logger.info(f"Remove reads whose GC content is {self.filter_GC_content}%")

if not self.disable_mapping:
logger.info("Mapping settings")
logger.info(f"Mapping reverse trimming: {self.trimming_rv}")
logger.info(f"Mapping inverse reverse trimming: {self.inverse_trimming_rv}")
logger.info("Mapping tool: STAR")
Expand All @@ -937,7 +944,21 @@ def createLogger(self) -> None:
if self.two_pass_mode:
logger.info("Using the STAR 2-pass mode for the mapping step")

if not self.disable_barcode:
logger.info("Demultiplexing settings")
logger.info(f"Ids(barcodes) file: {self.ids}")
logger.info(f"TaggD allowed mismatches: {self.allowed_missed}")
logger.info(f"TaggD kmer size: {self.allowed_kmer}")
logger.info(f"TaggD overhang: {self.overhang}")
logger.info(f"TaggD metric: {self.taggd_metric}")
if self.taggd_multiple_hits_keep_one:
logger.info("TaggD multiple hits keep one (random) is enabled")
if self.taggd_trim_sequences is not None:
logger.info(f"TaggD trimming from the barcodes: {'-'.join(str(x) for x in self.taggd_trim_sequences)}")
logger.info(f"TaggD chunk size: {self.taggd_chunk_size }")

if not self.disable_annotation:
logger.info("Annotation settings")
logger.info("Annotation tool: HTSeq")
logger.info(f"Annotation mode: {self.htseq_mode}")
logger.info(f"Annotation strandness: {self.strandness}")
Expand All @@ -951,6 +972,7 @@ def createLogger(self) -> None:
logger.info(f"Using the following points: {' '.join(str(p) for p in self.saturation_points)}")

if not self.disable_umi:
logger.info("UMI settings")
logger.info(f"UMIs start position: {self.umi_start_position}")
logger.info(f"UMIs end position: {self.umi_end_position}")
logger.info(f"UMIs allowed mismatches: {self.umi_allowed_mismatches}")
Expand All @@ -963,21 +985,6 @@ def createLogger(self) -> None:
if self.umi_filter:
logger.info(f"UMIs using filter: {self.umi_filter_template}")

if not self.disable_trimming:
if self.remove_polyA_distance > 0:
logger.info(f"Removing polyA sequences of a length of at least: {self.remove_polyA_distance}")
if self.remove_polyT_distance > 0:
logger.info(f"Removing polyT sequences of a length of at least: {self.remove_polyT_distance}")
if self.remove_polyG_distance > 0:
logger.info(f"Removing polyG sequences of a length of at least: {self.remove_polyG_distance}")
if self.remove_polyC_distance > 0:
logger.info(f"Removing polyC sequences of a length of at least: {self.remove_polyC_distance}")
if self.remove_polyN_distance > 0:
logger.info(f"Removing polyN sequences of a length of at least: {self.remove_polyN_distance}")
logger.info(f"Allowing {self.adaptor_missmatches} mismatches when removing homopolymers")
logger.info(f"Remove reads whose AT content is {self.filter_AT_content}%")
logger.info(f"Remove reads whose GC content is {self.filter_GC_content}%")

def run(self) -> None:
"""
Runs the whole pipeline given the parameters present.
Expand All @@ -999,7 +1006,7 @@ def run(self) -> None:
# START PIPELINE
# =================================================================
start_exe_time = globaltime.get_timestamp()
logger.info(f"Starting the pipeline: {start_exe_time}")
logger.info(f"Starting the pipeline: {timestemp_to_str(start_exe_time)}")

# =================================================================
# STEP: FILTERING
Expand All @@ -1008,7 +1015,7 @@ def run(self) -> None:
# Get the barcode length
barcode_length = len(list(read_barcode_file(self.ids).values())[0].sequence)
if not self.disable_trimming:
logger.info(f"Start filtering raw reads {globaltime.get_timestamp()}")
logger.info(f"Start filtering raw reads {timestemp_to_str(globaltime.get_timestamp())}")
try:
stats = filter_input_data(
self.fastq_fw,
Expand Down Expand Up @@ -1050,7 +1057,7 @@ def run(self) -> None:
if self.contaminant_index:
# To remove contaminants sequence we align the reads to the contaminant genome
# and keep the un-mapped reads
logger.info(f"Starting contaminant filter alignment {globaltime.get_timestamp()}")
logger.info(f"Starting contaminant filter alignment {timestemp_to_str(globaltime.get_timestamp())}")
try:
# Make the contaminant filter call
alignReads(
Expand Down Expand Up @@ -1106,7 +1113,7 @@ def run(self) -> None:
# STEP: Maps against the genome using STAR
# =================================================================
if not self.disable_mapping:
logger.info(f"Starting genome alignment {globaltime.get_timestamp()}")
logger.info(f"Starting genome alignment {timestemp_to_str(globaltime.get_timestamp())}")
input_reads = FILENAMES["contaminated_clean"] if self.contaminant_index else FILENAMES["quality_trimmed_R2"]
try:
# Make the alignment call
Expand Down Expand Up @@ -1147,7 +1154,7 @@ def run(self) -> None:
# STEP: DEMULTIPLEX READS Map against the barcodes
# =================================================================
if not self.disable_barcode:
logger.info(f"Starting barcode demultiplexing {globaltime.get_timestamp()}")
logger.info(f"Starting barcode demultiplexing {timestemp_to_str(globaltime.get_timestamp())}")
try:
stats = barcodeDemultiplexing( # type: ignore
FILENAMES["mapped"],
Expand Down Expand Up @@ -1228,7 +1235,7 @@ def run(self) -> None:
if not self.transcriptome
else self.qa_stats.reads_after_demultiplexing
)
logger.info(f"Starting computing saturation points {globaltime.get_timestamp()}")
logger.info(f"Starting computing saturation points {timestemp_to_str(globaltime.get_timestamp())}")
try:
compute_saturation(
reads,
Expand All @@ -1249,7 +1256,7 @@ def run(self) -> None:
# STEP: Create dataset and remove duplicates
# =================================================================
if os.path.isfile(FILENAMES["annotated"]):
logger.info(f"Starting creating dataset {globaltime.get_timestamp()}")
logger.info(f"Starting creating dataset {timestemp_to_str(globaltime.get_timestamp())}")
try:
stats = createDataset( # type: ignore
FILENAMES["annotated"],
Expand Down Expand Up @@ -1287,4 +1294,4 @@ def run(self) -> None:

finish_exe_time = globaltime.get_timestamp()
total_exe_time = finish_exe_time - start_exe_time
logger.info(f"Total Execution Time: {total_exe_time}")
logger.info(f"Total Execution Time: {timestemp_to_str(total_exe_time)}")

0 comments on commit b55375e

Please sign in to comment.