Skip to content

Add analyze_kive_batches scripts #1286

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 55 commits into from
Apr 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
1212fe0
Start moving the analyze_kive_batches scripts into MiCall
Donaim Apr 29, 2025
e06298a
small improvements to analyze_kive_batches
Donaim Apr 29, 2025
5e1f500
add all subcommand
Donaim Apr 29, 2025
9f93699
add get_batch_runs.py
Donaim Apr 29, 2025
145fb55
add generate_setup_stage_ninjafile.py
Donaim Apr 29, 2025
c829073
fix padding in ninjamaker
Donaim Apr 29, 2025
aebdf19
start implementing run_all
Donaim Apr 29, 2025
0b43cb2
fix newlines stripping
Donaim Apr 29, 2025
2c8d832
fix small bugs
Donaim Apr 29, 2025
c1368f2
improve sample data
Donaim Apr 29, 2025
c38a1ed
implement sample collection
Donaim Apr 29, 2025
19b51ad
implement combination
Donaim Apr 29, 2025
ac7c2fa
add new_atomic_directory
Donaim Apr 29, 2025
0b7e337
move downloaded runs into a subdirectory
Donaim Apr 29, 2025
7795d78
use new_atomic_directory
Donaim Apr 29, 2025
be894b9
implement individual processing
Donaim Apr 29, 2025
d8c7935
fix subtle shadowing bug
Donaim Apr 29, 2025
0e4f11a
improve stats.json contents
Donaim Apr 29, 2025
44ac072
add a warning in make_stats_1
Donaim Apr 29, 2025
8b78fdb
put batches into a subdirectory
Donaim Apr 29, 2025
7e73b35
rename get_batch_runs to get_batch
Donaim Apr 29, 2025
68935ba
combine stats.json
Donaim Apr 29, 2025
6a54377
fix missing run_id
Donaim Apr 29, 2025
5a6b2fd
generate runs.txt
Donaim Apr 29, 2025
fb35a2a
use runs_txt instead of runs_json
Donaim Apr 29, 2025
1afd2da
add pandas-stubs to dev dependencies
Donaim Apr 30, 2025
ddbc653
implement stats aggregation
Donaim Apr 29, 2025
63426ca
add overlap analysis
Donaim Apr 30, 2025
717911f
calculate overlaps
Donaim Apr 30, 2025
505bf8e
improve stitching
Donaim Apr 30, 2025
a6d1146
rename make-stats-1 to make-stats
Donaim Apr 30, 2025
95100af
fix typing in make-stats
Donaim Apr 30, 2025
628483c
better field names
Donaim Apr 30, 2025
93de724
fix comments
Donaim Apr 30, 2025
ef2cf24
combine overlap data
Donaim Apr 30, 2025
694a7c9
aggregate overlap infos
Donaim Apr 30, 2025
2b8e416
rename assembler to app
Donaim Apr 30, 2025
199033b
rename category to safe_app
Donaim Apr 30, 2025
9470779
add sample_properties.toml to tests data
Donaim Apr 30, 2025
c27c896
implement make-properties
Donaim Apr 30, 2025
7aebf24
make properties during all_run
Donaim Apr 30, 2025
a0a75a5
join tables at the end
Donaim Apr 30, 2025
9d2b838
fix column name collision
Donaim Apr 30, 2025
20a7d59
fix join_tables
Donaim Apr 30, 2025
0ff15b4
fix unbound variable error
Donaim Apr 30, 2025
360640c
fix handling of missing stats.json
Donaim Apr 30, 2025
544ffd5
fix make_properties duplication
Donaim Apr 30, 2025
0cdf6a3
improve handling of incomplete runs
Donaim Apr 30, 2025
b58ab38
have a difference between still running and incomplete
Donaim Apr 30, 2025
82243db
improve run check
Donaim Apr 30, 2025
c67c4a4
do not always skip existing downloads
Donaim Apr 30, 2025
38d5cfc
add missing implicit dependencies
Donaim Apr 30, 2025
209d5c0
skip not good runs
Donaim Apr 30, 2025
f39f2d1
skip all runs not in state C
Donaim Apr 30, 2025
19d622a
do not escape the app name
Donaim Apr 30, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions micall/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
"micall/utils/fasta_to_fastq.py",
"micall/utils/append_primers.py",
"micall/utils/randomize_fastq.py",
"micall/utils/analyze_kive_batches/analyze_kive_batches.py",
]


Expand Down
2 changes: 2 additions & 0 deletions micall/tests/data/sample_batcheslist.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
noref-stitcher-comparison-0052
noref-stitcher-comparison-0053
12 changes: 12 additions & 0 deletions micall/tests/data/sample_properties.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
["micall:v7.17.0-818-ga64b78005 / denovo"]
is_denovo = true
is_haploflow = true
haploflow_kmer = 41
with_referenceless = true
referenceless_minimum_matches = 98

["micall:v7.17.0-728-g9eb594078 / denovo"]
is_denovo = true
is_haploflow = false
with_referenceless = true
referenceless_minimum_matches = 14
24 changes: 24 additions & 0 deletions micall/utils/analyze_kive_batches/aggregate_runs_overlaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@

from pathlib import Path
import pandas as pd


def aggregate_runs_overlaps(input: Path, output: Path) -> None:
    """Aggregate per-overlap rows into per-app averages.

    Reads a CSV of individual overlap records (one row per overlap, with an
    "app" column plus overlap_size/overlap_mismatches/overlap_pvalue), groups
    the rows by app, and writes one summary row per app with the mean of each
    metric and the number of overlaps observed.
    """

    table = pd.read_csv(input)

    # Named aggregation keeps the output column names explicit; counting the
    # "app" column itself yields the group size.
    summary = table.groupby('app').agg(
        avg_overlap_size=('overlap_size', 'mean'),
        avg_overlap_mismatches=('overlap_mismatches', 'mean'),
        avg_overlap_pvalue=('overlap_pvalue', 'mean'),
        overlap_count=('app', 'count'),
    ).reset_index()

    summary.to_csv(output, index=False)
29 changes: 29 additions & 0 deletions micall/utils/analyze_kive_batches/aggregate_runs_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

from pathlib import Path
import pandas as pd


def aggregate_runs_stats(input: Path, output: Path) -> None:
    """Aggregate per-run statistics into per-app averages.

    Reads a CSV with one row per run (keyed by the "app" column), groups the
    rows by app, and writes one summary row per app holding the mean of each
    numeric metric plus the number of runs in the group.
    """

    table = pd.read_csv(input)

    # Named aggregation: each keyword becomes an output column computed from
    # (source_column, aggregation) pairs; counting "app" gives the group size.
    summary = table.groupby('app').agg(
        avg_concordance=('concordance', 'mean'),
        avg_depth=('depth', 'mean'),
        avg_mlen=('mlen', 'mean'),
        avg_total_mlen=('total_mlen', 'mean'),
        avg_overlap_count=('overlap_count', 'mean'),
        avg_number_of_contigs=('number_of_contigs', 'mean'),
        avg_contigs_size=('avg_contigs_size', 'mean'),
        avg_run_time=('run_time', 'mean'),
        run_count=('app', 'count'),
    ).reset_index()

    summary.to_csv(output, index=False)
206 changes: 206 additions & 0 deletions micall/utils/analyze_kive_batches/analyze_kive_batches.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
#! /usr/bin/env python

import argparse
import sys
from typing import Sequence
from pathlib import Path
import logging

from micall.utils.dir_path import DirPath
from micall.utils.user_error import UserError
from .logger import logger

from .download import download
from .make_stats import make_stats
from .get_batch import get_batch
from .run_all import run_all
from .combine_batches_runs import combine_batches_runs
from .combine_runs_stats import combine_runs_stats
from .combine_runs_overlaps import combine_runs_overlaps
from .extract_run_ids import extract_run_ids
from .aggregate_runs_stats import aggregate_runs_stats
from .aggregate_runs_overlaps import aggregate_runs_overlaps
from .stitch_contigs import stitch_contigs
from .make_properties import make_properties
from .join_tables import join_tables


def dir_path(string: str) -> DirPath:
    """argparse type-converter: accept a directory path.

    A path that does not exist yet is accepted too (it can be created later);
    only an existing non-directory path is rejected.
    """

    path = Path(string)
    if path.exists() and not path.is_dir():
        raise UserError("Path %r is not a directory.", string)
    return DirPath(path)


def cli_parser() -> argparse.ArgumentParser:
    """Build the command-line parser with one sub-parser per pipeline step.

    Returns an ArgumentParser whose required "subcommand" destination selects
    which step of the analysis to run; main_typed() dispatches on it.
    """

    parser = argparse.ArgumentParser(description="Analyze a kive run.")
    mode_parsers = parser.add_subparsers(dest='subcommand',
                                         title='subcommands',
                                         required=True,
                                         )

    # One local name is reused for every sub-parser; the first one previously
    # shadowed the builtin `all`.
    sub = mode_parsers.add_parser("all", help="The main entry to this script. Runs all other subentries.")
    sub.add_argument("--batches-list", type=Path, required=True,
                     help="Path to a text-file containing the list of batches to be analyzed.")
    sub.add_argument("--root", type=dir_path, required=True,
                     help="Root directory for all output subdirectories.")
    sub.add_argument("--properties", type=Path, required=True,
                     help="Additional properties associated with particular images.")

    sub = mode_parsers.add_parser("get-batch", help="Downloads a batch info.")
    sub.add_argument("--batch", type=str, required=True,
                     help="The name of the batch to download the runs info for.")
    sub.add_argument("--target", type=Path, required=True,
                     help="Target file where to put the runs info to.")

    sub = mode_parsers.add_parser("combine-batches-runs", help="Extract batches run infos and combine them.")
    sub.add_argument("--batches", type=Path, required=True, nargs=argparse.ONE_OR_MORE,
                     help="The downloaded batches files.")
    sub.add_argument("--target", type=Path, required=True,
                     help="Target file where to put the runs info to.")

    sub = mode_parsers.add_parser("combine-runs-stats", help="Combine all stats.json files into one.")
    sub.add_argument("--root", type=dir_path, required=True,
                     help="Root directory for all output subdirectories.")
    sub.add_argument("--runs-txt", type=Path, required=True,
                     help="The txt file with all the run ids in it.")
    sub.add_argument("--target", type=Path, required=True,
                     help="Target file where to put the combined stats to.")

    sub = mode_parsers.add_parser("combine-runs-overlaps", help="Combine all stats.json:overlaps data into one file.")
    sub.add_argument("--root", type=dir_path, required=True,
                     help="Root directory for all output subdirectories.")
    sub.add_argument("--runs-txt", type=Path, required=True,
                     help="The txt file with all the run ids in it.")
    sub.add_argument("--target", type=Path, required=True,
                     help="Target file where to put the combined overlaps to.")

    sub = mode_parsers.add_parser("download")
    sub.add_argument("--json-file", type=Path, required=True,
                     help="The big JSON file with all the run infos.")
    sub.add_argument("--root", type=dir_path, required=True,
                     help="Root directory for all output subdirectories.")

    sub = mode_parsers.add_parser("make-stats")
    sub.add_argument("--input", type=Path, required=True,
                     help="Input JSON file with the run info.")
    sub.add_argument("--output", type=Path, required=True,
                     help="Output stats file.")

    sub = mode_parsers.add_parser("stitch-contigs")
    sub.add_argument("--info-file", type=Path, required=True,
                     help="Input JSON file with the run info.")
    sub.add_argument("--output", type=Path, required=True,
                     help="Output file.")

    sub = mode_parsers.add_parser("aggregate-runs-stats")
    sub.add_argument("--input", type=Path, required=True,
                     help="Input CSV file with the run stats.")
    sub.add_argument("--output", type=Path, required=True,
                     help="Output stats file.")

    sub = mode_parsers.add_parser("aggregate-runs-overlaps")
    sub.add_argument("--input", type=Path, required=True,
                     help="Input CSV file with the run overlaps.")
    sub.add_argument("--output", type=Path, required=True,
                     help="Output stats file.")

    sub = mode_parsers.add_parser("make-properties")
    sub.add_argument("--input", type=Path, required=True,
                     help="Input TOML file with the apps properties.")
    sub.add_argument("--output", type=Path, required=True,
                     help="Output CSV file.")

    sub = mode_parsers.add_parser("extract-run-ids")
    sub.add_argument("--input", type=Path, required=True,
                     help="Input JSON file with the run info.")
    sub.add_argument("--output", type=Path, required=True,
                     help="Output ids file.")

    sub = mode_parsers.add_parser("join-tables")
    # required=True added: without it a missing --inputs reached join_tables()
    # as None and crashed, instead of a clean argparse error.
    sub.add_argument("--inputs", type=Path, required=True, nargs=argparse.ONE_OR_MORE,
                     help="Input CSV files to union.")
    sub.add_argument("--column", type=str, required=True,
                     help="The column that will serve as the index.")
    sub.add_argument("--output", type=Path, required=True,
                     help="Output CSV file.")

    verbosity_group = parser.add_mutually_exclusive_group()
    verbosity_group.add_argument('--verbose', action='store_true', help='Increase output verbosity.')
    # NOTE(review): store_true combined with default=True makes no_verbose
    # always True; main() never reads it, so it only serves as documentation
    # of the default verbosity — confirm before relying on args.no_verbose.
    verbosity_group.add_argument('--no-verbose', action='store_true', help='Normal output verbosity.', default=True)
    verbosity_group.add_argument('--debug', action='store_true', help='Maximum output verbosity.')
    verbosity_group.add_argument('--quiet', action='store_true', help='Minimize output verbosity.')

    return parser


def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse the given command-line arguments with the tool's CLI parser."""
    return cli_parser().parse_args(argv)


def main_typed(subcommand: str, args: argparse.Namespace) -> None:
    """Dispatch to the implementation of the chosen subcommand.

    Every branch forwards the relevant parsed options as keyword arguments.
    The `subcommand` parameter was previously ignored in favour of re-reading
    args.subcommand; it carries the same value and is now used directly.

    Raises UserError for an unrecognized subcommand (defensive only: argparse
    requires one of the registered subcommands, so this should be unreachable).
    """

    if subcommand == 'all':
        run_all(batches_list=args.batches_list, root=args.root, properties=args.properties)
    elif subcommand == 'get-batch':
        get_batch(batch=args.batch, target=args.target)
    elif subcommand == 'combine-batches-runs':
        combine_batches_runs(batches=args.batches, target=args.target)
    elif subcommand == 'combine-runs-stats':
        combine_runs_stats(root=args.root, runs_txt=args.runs_txt, target=args.target)
    elif subcommand == 'combine-runs-overlaps':
        combine_runs_overlaps(root=args.root, runs_txt=args.runs_txt, target=args.target)
    elif subcommand == 'download':
        download(json_file=args.json_file, root=args.root)
    elif subcommand == 'make-stats':
        make_stats(input=args.input, output=args.output)
    elif subcommand == 'stitch-contigs':
        stitch_contigs(info_file=args.info_file, output=args.output)
    elif subcommand == 'extract-run-ids':
        extract_run_ids(input=args.input, output=args.output)
    elif subcommand == 'aggregate-runs-stats':
        aggregate_runs_stats(input=args.input, output=args.output)
    elif subcommand == 'aggregate-runs-overlaps':
        aggregate_runs_overlaps(input=args.input, output=args.output)
    elif subcommand == 'make-properties':
        make_properties(input=args.input, output=args.output)
    elif subcommand == 'join-tables':
        join_tables(inputs=args.inputs, column=args.column, output=args.output)
    else:
        raise UserError("Unrecognized subcommand %r.", subcommand)


def main(argv: Sequence[str]) -> int:
    """Run the tool: parse arguments, set the log level, dispatch, report.

    Returns a process exit code: 0 on success, 1 on a broken pipe or
    keyboard interrupt, or the UserError's own code on a user-facing failure.
    """

    args = parse_args(argv)
    subcommand: str = args.subcommand

    # Map the mutually exclusive verbosity flags onto the logger level;
    # the fall-through default corresponds to --no-verbose.
    if args.quiet:
        logger.setLevel(logging.ERROR)
    elif args.verbose:
        logger.setLevel(logging.INFO)
    elif args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        # logging.WARNING instead of the deprecated WARN alias.
        logger.setLevel(logging.WARNING)

    try:
        main_typed(subcommand, args)
        logger.debug("Done.")
        return 0
    except BrokenPipeError:
        logger.debug("Broken pipe.")
        return 1
    except KeyboardInterrupt:
        logger.debug("Interrupted.")
        return 1
    except UserError as e:
        # UserError carries a printf-style format and arguments.
        logger.fatal(e.fmt, *e.fmt_args)
        return e.code


def entry() -> None:
    """Console-script entry point: exit the process with main()'s status."""
    sys.exit(main(sys.argv[1:]))


if __name__ == '__main__':
    entry()  # noqa
4 changes: 4 additions & 0 deletions micall/utils/analyze_kive_batches/batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

from typing import NewType

# Distinct static type for a Kive batch name: a plain str at runtime, but a
# separate type to the checker so batch names can't be mixed up with other
# strings.
BatchName = NewType("BatchName", str)
18 changes: 18 additions & 0 deletions micall/utils/analyze_kive_batches/combine_batches_runs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@

from typing import Iterable
from pathlib import Path
import json


def combine_batches_runs(batches: Iterable[Path], target: Path) -> None:
    """Merge the run lists from several downloaded batch files into one JSON array.

    Each file in `batches` holds a JSON array of batch objects, each carrying
    a "runs" list; every run from every batch is collected, in file order,
    and written to `target` as a single tab-indented JSON array.
    """

    combined = [run
                for batch_file in batches
                for batch_json in json.loads(batch_file.read_text())
                for run in batch_json["runs"]]

    with target.open("w") as writer:
        json.dump(combined, writer, indent='\t')
39 changes: 39 additions & 0 deletions micall/utils/analyze_kive_batches/combine_runs_overlaps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@

from pathlib import Path
import json
import csv

from micall.utils.dir_path import DirPath
from micall.utils.new_atomic_file import new_atomic_text_file
from .logger import logger


# Columns of the combined overlaps CSV, in output order.
FIELDNAMES = ("app",
              "overlap_size",
              "overlap_mismatches",
              "overlap_pvalue",
              "run_id",
              "sample",
              )


def combine_runs_overlaps(root: DirPath, runs_txt: Path, target: Path) -> None:
    """Collect the per-overlap records from every run's stats.json into one CSV.

    For each run id listed in `runs_txt`, reads root/runs/<id>/stats.json and
    emits one CSV row per entry of its "overlaps" list, tagged with the run's
    app, run_id and sample. Runs without a stats.json are skipped with a
    debug log message.
    """

    # Ignore blank lines (e.g. a trailing newline) so they don't become
    # bogus run ids.
    run_ids = [line for line in runs_txt.read_text().splitlines() if line]
    with new_atomic_text_file(target) as writer:
        dwriter = csv.DictWriter(writer, fieldnames=FIELDNAMES)
        dwriter.writeheader()
        for run_id in run_ids:
            stats = root / "runs" / str(run_id) / "stats.json"
            if not stats.exists():
                logger.debug("No stats file for run %s.", run_id)
                continue

            with stats.open() as stats_reader:
                stats_object = json.load(stats_reader)

            overlaps = stats_object.get("overlaps", [])
            for overlap in overlaps:
                # Build the row from known columns only: DictWriter raises
                # ValueError on unexpected keys, and this also avoids
                # mutating the parsed overlap dict in place.
                row = {key: value for key, value in overlap.items()
                       if key in FIELDNAMES}
                row["app"] = stats_object["app"]
                row["run_id"] = stats_object["run_id"]
                row["sample"] = stats_object["sample"]
                dwriter.writerow(row)
41 changes: 41 additions & 0 deletions micall/utils/analyze_kive_batches/combine_runs_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@

from pathlib import Path
import json
import csv

from micall.utils.dir_path import DirPath
from micall.utils.new_atomic_file import new_atomic_text_file
from .logger import logger


# Columns of the combined stats CSV, in output order.
FIELDNAMES = ("app",
              "concordance",
              "depth",
              "mlen",
              "total_mlen",
              "overlap_count",
              "number_of_contigs",
              "avg_contigs_size",
              "run_time",
              "run_id",
              "sample",
              )


def combine_runs_stats(root: DirPath, runs_txt: Path, target: Path) -> None:
    """Combine every run's stats.json into one CSV, one row per run.

    For each run id listed in `runs_txt`, reads root/runs/<id>/stats.json and
    writes the FIELDNAMES subset of its keys as a CSV row; missing keys are
    left empty. Runs without a stats.json are skipped with a debug message.
    """

    with new_atomic_text_file(target) as writer:
        dwriter = csv.DictWriter(writer, fieldnames=FIELDNAMES)
        dwriter.writeheader()
        for run_id in runs_txt.read_text().splitlines():
            stats_path = root / "runs" / str(run_id) / "stats.json"
            if not stats_path.exists():
                logger.debug("No stats file for run %s.", run_id)
                continue

            with open(stats_path) as stats_reader:
                stats_object = json.load(stats_reader)

            # Restrict to the declared columns; extras would make DictWriter
            # raise, and absent ones default to the empty string.
            row = {key: stats_object[key]
                   for key in FIELDNAMES
                   if key in stats_object}
            dwriter.writerow(row)
Loading