Skip to content

Commit

Permalink
Merge pull request #116 from epifluidlab/fix_coverage_frag_len
Browse files Browse the repository at this point in the history
Fix coverage and frag_length modules
  • Loading branch information
jamesli124 authored Dec 5, 2024
2 parents 702f6d2 + 5a822cb commit b814de7
Show file tree
Hide file tree
Showing 12 changed files with 539 additions and 349 deletions.
28 changes: 28 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,33 @@ The format is based on
and this project adheres to
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.8.0] - 2024-12-04

### Removed
- `finaletoolkit.frag.frag_length_bins` no longer has the `contig_by_contig`
option. This never had any functionality.
- `finaletoolkit.frag.frag_length_bins` no longer generates a text-based
histogram.

### Fixed
- `contig_sizes` option included for `cleavage-profile` CLI command.
- `normalize` option for `coverage` fixed so it no longer normalizes twice.
- `normalize=False` for `coverage` now runs much faster.
- Miscellaneous type hints and docstrings.

### Changed
- `finaletoolkit.frag.frag_length_bins` uses a dict based implementation
that is more memory efficient.
- `finaletoolkit.frag.frag_length_bins` and
`finaletoolkit.frag.frag_length_intervals` now take `min_length` and
`max_length` keyword args to only consider fragments of certain lengths.
- Flags for `frag-length-bins` and `frag-length-intervals` CLI commands updated to match the Python API.
- `coverage` default argument for `normalize` changed to `False`.
- `coverage` default argument for `scale_factor` changed to 1.

### Added
- `finaletoolkit.frag.frag_length_bins` can generate a histogram figure.

## [0.7.8] - 2024-11-28

### Fixed
Expand All @@ -17,6 +44,7 @@ and this project adheres to
- `normalize` keyword argument and `--normalize` flag to `finaletoolkit.frag.coverage` function and `finaletoolkit coverage` subcommand, respectively. Setting this argument/flag to true results in the output
being normalized by the total coverage, ignoring `scale_factor` if specified.
- `--intersect-policy` or `-p` flag added to `finaletoolkit coverage` subcommand.
- tests for `frag_length` module

## [0.7.7] - 2024-11-27

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dependencies = [
"pandas",
"statsmodels",
"loess",
"matplotlib",
]
dynamic = ["version"]

Expand Down
63 changes: 46 additions & 17 deletions src/finaletoolkit/cli/main_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,16 @@ def main_cli_parser():
'-n',
'--normalize',
action='store_true',
help="If flag set, ignores any user inputed scale factor and "
"normalizes output by total coverage."
help="If flag set, multiplies by user inputed scale factor if"
" given and normalizes output by total coverage. May lead to "
"longer execution time for high-throughput data."
)
cli_coverage.add_argument(
'-s',
'--scale-factor',
default=1e6,
default=1.,
type=float,
help='Scale factor for coverage values.')
help='Scale factor for coverage values. Default is 1.')
cli_coverage.add_argument(
'-p',
'--intersect_policy',
Expand Down Expand Up @@ -122,6 +123,24 @@ def main_cli_parser():
type=int,
help='Specify the 0-based left-most coordinate of the interval '
'to select fragments from. (Must also specify --contig.)')
cli_frag_length_bins.add_argument(
'-E',
'--stop',
help='Specify the 1-based right-most coordinate of the interval'
' to select fragments from. (Must also specify --contig.)',
type=int)
cli_frag_length_bins.add_argument(
'-min',
'--min-length',
default=0,
type=int,
help='Minimum length for a fragment to be included in fragment length.')
cli_frag_length_bins.add_argument(
'-max',
'--max-length',
default=None,
type=int,
help='Maximum length for a fragment to be included in fragment length.')
cli_frag_length_bins.add_argument(
'-p',
'--intersect_policy',
Expand All @@ -131,15 +150,10 @@ def main_cli_parser():
type=str,
help='Specifies what policy is used to include fragments in the'
' given interval. See User Guide for more information.')
cli_frag_length_bins.add_argument(
'-E',
'--stop',
help='Specify the 1-based right-most coordinate of the interval'
' to select fragments from. (Must also specify --contig.)',
type=int)
cli_frag_length_bins.add_argument(
'--bin-size',
type=int,
default=1,
help='Specify the size of the bins to group fragment lengths '
'into.')
cli_frag_length_bins.add_argument(
Expand All @@ -149,14 +163,15 @@ def main_cli_parser():
type=str,
help='A .TSV file containing containing fragment lengths binned'
' according to the specified bin size.')
cli_frag_length_bins.add_argument(
'--contig-by-contig',
action='store_true',
help='Placeholder, not implemented.')
cli_frag_length_bins.add_argument(
'--histogram',
action='store_true',
help='Enable histogram mode to display histogram in terminal.')
help='Enable histogram mode to display histogram.')
cli_frag_length_bins.add_argument(
'--histogram-path',
default=None,
help='Path to store histogram.',
)
cli_frag_length_bins.add_argument(
'-q',
'--quality_threshold',
Expand Down Expand Up @@ -185,6 +200,18 @@ def main_cli_parser():
'interval_file',
help='Path to a BED file containing intervals to retrieve '
'fragment length summary statistics over.')
cli_frag_length_intervals.add_argument(
'-min',
'--min-length',
default=0,
type=int,
help='Minimum length for a fragment to be included in fragment length.')
cli_frag_length_intervals.add_argument(
'-max',
'--max-length',
default=None,
type=int,
help='Maximum length for a fragment to be included in fragment length.')
cli_frag_length_intervals.add_argument(
'-p',
'--intersect_policy',
Expand Down Expand Up @@ -236,8 +263,10 @@ def main_cli_parser():
help='Path to a BED file containing intervals to calculates cleavage '
'proportion over.')
cli_cleavage_profile.add_argument(
'-o',
'--output_file',
'chrom_sizes',
help='A .chrom.sizes file containing chromosome names and sizes.')
cli_cleavage_profile.add_argument(
'output_file',
default='-',
help='A bigWig file containing the cleavage proportion results over '
'the intervals specified in interval file.',)
Expand Down
27 changes: 24 additions & 3 deletions src/finaletoolkit/frag/cleavage_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,30 @@ def _cli_cleavage_profile(
verbose: Union[bool, int]=0
):
"""
Function called when running cleavage profile subcommand in cli.
Multithreaded implementation over intervals in bed.
Multithreaded cleavage profile implementation over intervals in a
BED file.
Parameters
----------
input_file: str or pathlike
SAM, BAM, CRAM, or FRAG file with fragment information.
interval_file: str or pathlike
BED file containing intervals
chrom_sizes: str or pathlike
Tab-delimited file with name and lengths of each contig.
left: int
Amount to subtract from start coordinate. Useful if only given
coordinates of CpG.
right: int
Amount to add to stop coordinate.
fraction_low: int
Minimum fragment size to include
fraction_high: int
Maximum fragment size to include
quality_threshold: int
Minimum MAPQ
workers: int, default = 1
Number of processes to spawn
verbose: bool or int
"""

if (verbose):
Expand Down Expand Up @@ -289,7 +311,6 @@ def _cli_cleavage_profile(
elif (output_file.endswith('.bed.gz')
or output_file.endswith('bedgraph.gz')
or output_file == "-"):
# XXX: writing to stdout is untested and may not work.
with gzip.open(output_file, 'wt') as bedgraph:
for interval_score in interval_scores:
contigs = interval_score['contig']
Expand Down
Loading

0 comments on commit b814de7

Please sign in to comment.