diff --git a/README.md b/README.md
index 59f4753..75b43e1 100644
--- a/README.md
+++ b/README.md
@@ -25,11 +25,18 @@ Clair3-Nova is the 2nd generation of [Clair3-Trio](https://github.com/HKU-BAL/Cl
+ [Option 1. Docker pre-built image](#option-1-docker-pre-built-image)
+ [Option 2. Build an anaconda virtual environment](#option-2-build-an-anaconda-virtual-environment)
+ [Option 3. Docker Dockerfile](#option-3-docker-dockerfile)
+* [General Usage](#general-usage)
+* [Options](#options)
* [Output Files](#output-files)
* [How to get high-quality de novo variants from the output](#how-to-get-high-quality-de-novo-variants-from-the-output)
+* [Model Training](docs/trio/nova_training.md)
----
## Latest Updates
+*v0.3 (June 23, 2024)*: add the r10.4.1 HAC model and the base_err feature
+1. add an r10.4.1 HAC model trained on the HG002 trio
+2. add the `--base_err` [flag](https://github.com/HKU-BAL/Clair3/issues/220) for reducing "./." calls in the GVCF output
+3. add the `--keep_iupac_bases` [flag](https://en.wikipedia.org/wiki/International_Union_of_Pure_and_Applied_Chemistry) to show IUPAC characters.
*v0.2.1 (May 15, 2024)*: fix bugs of SelectCandidates_Trio.
@@ -37,7 +44,6 @@ Clair3-Nova is the 2nd generation of [Clair3-Trio](https://github.com/HKU-BAL/Cl
*v0.1 (Feb 6, 2024)*: Initial release.
-
----
## Pre-trained Models
@@ -47,6 +53,7 @@ Download models from [here](http://www.bio8.cs.hku.hk/clair3_trio/clair3_nova_mo
| Model name | Platform | Training samples | Date | Basecaller | File | Link |
| :---: | :---: | :---: | --- | :---: | --- | :---: |
| r1041_e82_400bps_sup_nova | ONT r10.4.1 E8.2 (5kHz) | HG002,3,4 | 20240206 | Dorado v4.0.0 SUP | r1041_e82_400bps_sup_nova.tar.gz | [Download](http://www.bio8.cs.hku.hk/clair3_trio/clair3_nova_models/r1041_e82_400bps_sup_nova.tar.gz) |
+| r1041_e82_400bps_hac_nova | ONT r10.4.1 E8.2 (5kHz) | HG002,3,4 | 20240723 | Dorado v4.0.0 HAC | r1041_e82_400bps_hac_nova.tar.gz | [Download](http://www.bio8.cs.hku.hk/clair3_trio/clair3_nova_models/r1041_e82_400bps_hac_nova.tar.gz) |
| r941_prom_sup_g5014_nova | ONT r9.4.1 | HG002,3,4 | 20240330 | Guppy5 sup | r941_prom_sup_g5014_nova.tar.gz | [Download](http://www.bio8.cs.hku.hk/clair3_trio/clair3_nova_models/r941_prom_sup_g5014_nova.tar.gz) |
@@ -58,13 +65,11 @@ When using the Clair3-Nova model, please use a corresponding Clair3 model for Pi
| Model name | Platform | Training samples | Date | Basecaller | File | Link |
| :---: | :---: | :---: | --- | :---: | --- | :---: |
| r1041_e82_400bps_sup_v430 | ONT r10.4.1 E8.2 (5kHz) | - | - | Dorado v4.3.0 SUP | r1041_e82_400bps_sup_v430.tar.gz | [Download](http://www.bio8.cs.hku.hk/clair3_trio/clair3_models/r1041_e82_400bps_sup_v430.tar.gz) |
+| r1041_e82_400bps_hac_v430 | ONT r10.4.1 E8.2 (5kHz) | - | - | Dorado v4.3.0 HAC | r1041_e82_400bps_hac_v430.tar.gz | [Download](http://www.bio8.cs.hku.hk/clair3_trio/clair3_models/r1041_e82_400bps_hac_v430.tar.gz) |
| r941_prom_sup_g5014 | ONT r9.4.1 | - | - | Guppy5 sup | r941_prom_sup_g5014.tar.gz | [Download](http://www.bio8.cs.hku.hk/clair3_trio/clair3_models/r941_prom_sup_g5014.tar.gz) |
-
-
----
-
## Installation
### Option 1. Docker pre-built image
@@ -81,7 +86,6 @@ THREADS="[MAXIMUM_THREADS]" # e.g. 8
MODEL_C3="[Clair3 MODEL NAME]" # Clair3 model, e.g. r1041_e82_400bps_sup_v430
MODEL_C3D="[Clair3-Nova MODEL NAME]" # Clair3-Nova model, e.g. r1041_e82_400bps_sup_nova
-
docker run -it \
-v ${INPUT_DIR}:${INPUT_DIR} \
-v ${OUTPUT_DIR}:${OUTPUT_DIR} \
@@ -147,13 +151,11 @@ mkdir -p models/clair3_models
wget http://www.bio8.cs.hku.hk/clair3_trio/clair3_models/clair3_models.tar.gz
tar -zxvf clair3_models.tar.gz -C ./models/clair3_models
-
# download Clair3-Nova's pre-trained models
mkdir -p models/clair3_nova_models
wget http://www.bio8.cs.hku.hk/clair3_trio/clair3_nova_models/clair3_nova_models.tar.gz
tar -zxvf clair3_nova_models.tar.gz -C ./models/clair3_nova_models
-
# run clair3-nova
_INPUT_DIR="[YOUR_INPUT_FOLDER]" # e.g. ./input
_BAM_C=${_INPUT_DIR}/input_child.bam # change your child's bam file name here
@@ -199,6 +201,85 @@ docker build -f ./Dockerfile -t hkubal/clair3-nova:latest .
docker run -it hkubal/clair3-nova:latest /opt/bin/run_clair3_nova.sh --help
```
+### General Usage
+
+**Caution**: Use `=value` for optional parameters, e.g. `--bed_fn=fn.bed` instead of `--bed_fn fn.bed`.
+
+```bash
+./run_clair3_nova.sh \
+  --bam_fn_c=${_BAM_C} \
+  --bam_fn_p1=${_BAM_P1} \
+  --bam_fn_p2=${_BAM_P2} \
+  --output=${_OUTPUT_DIR} \
+  --ref_fn=${_REF} \
+  --threads=${_THREADS} \
+  --model_path_clair3="${_MODEL_DIR_C3}" \
+  --model_path_clair3_nova="${_MODEL_DIR_C3D}" \
+  --sample_name_c=${_SAMPLE_C} \
+  --sample_name_p1=${_SAMPLE_P1} \
+  --sample_name_p2=${_SAMPLE_P2}
+```
+
+### Options
+
+**Required parameters:**
+
+```bash
+--bam_fn_c=FILE               BAM file input, for the child. The input file must be samtools indexed.
+--bam_fn_p1=FILE              BAM file input, for parent 1. The input file must be samtools indexed.
+--bam_fn_p2=FILE              BAM file input, for parent 2. The input file must be samtools indexed.
+--ref_fn=FILE                 FASTA reference file input. The input file must be samtools indexed.
+--model_path_clair3=STR       The folder path containing a Clair3 model (requiring six files in the folder, including pileup.data-00000-of-00002, pileup.data-00001-of-00002, pileup.index, full_alignment.data-00000-of-00002, full_alignment.data-00001-of-00002, and full_alignment.index).
+--model_path_clair3_nova=STR  The folder path containing a Clair3-Nova model (file structure same as Clair3).
+-t, --threads=INT             Max #threads to be used. The full genome will be divided into small chunks for parallel processing. Each chunk will use 4 threads. The #chunks being processed simultaneously is ceil(#threads/4)*3; 3 is the overloading factor.
+-o, --output=PATH             VCF/GVCF output directory.
+
+```
+
+**Other parameters:**
+
+**Caution**: Use `=value` for optional parameters, e.g., `--bed_fn=fn.bed` instead of `--bed_fn fn.bed`
+
+```bash
+-v, --version                 Show Clair3-Nova version
+-h, --help                    Show the Clair3-Nova help page
+--bed_fn=FILE                 Call variants only in the provided bed regions.
+--vcf_fn=FILE                 Candidate sites VCF file input; variants will only be called at the sites in the VCF file if provided.
+--ctg_name=STR                The name of the sequence to be processed.
+--pileup_only                 Use the pileup model only when calling, default: disable.
+--pileup_phasing              Use the pileup model for calling and phasing, default: disable.
+--sample_name_c=STR           Define the sample name for the child to be shown in the VCF file, default: [Child].
+--sample_name_p1=STR          Define the sample name for parent 1 to be shown in the VCF file, default: [Parent1].
+--sample_name_p2=STR          Define the sample name for parent 2 to be shown in the VCF file, default: [Parent2].
+--gvcf                        Enable GVCF output, default: disable.
+--qual=INT                    If set, variants with quality >= INT will be marked PASS, or LowQual otherwise.
+--samtools=STR                Path of samtools, samtools version >= 1.10 is required.
+--python=STR                  Path of python, python3 >= 3.6 is required.
+--pypy=STR                    Path of pypy3, pypy3 >= 3.6 is required.
+--parallel=STR                Path of parallel, parallel >= 20191122 is required.
+--whatshap=STR                Path of whatshap, whatshap >= 1.0 is required.
+--chunk_size=INT              The size of each chunk for parallel processing, default: 5000000.
+--print_ref_calls             Show reference calls (0/0) in the VCF file, default: disable.
+--include_all_ctgs            Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable.
+--snp_min_af=FLOAT            Minimum SNP AF required for a candidate variant. Lowering the value might slightly increase sensitivity at the cost of speed and accuracy, default: ont:0.08.
+--indel_min_af=FLOAT          Minimum Indel AF required for a candidate variant. Lowering the value might slightly increase sensitivity at the cost of speed and accuracy, default: ont:0.15.
+--nova_model_prefix=STR       Model prefix in trio & nova calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002, and $prefix.index, default: nova.
+--enable_output_phasing       Output phased variants using whatshap, default: disable.
+--enable_output_haplotagging  Output a haplotagged BAM using whatshap, default: disable.
+--enable_phasing              Same as `--enable_output_phasing`; the option is retained for backward compatibility.
+--var_pct_full=FLOAT          EXPERIMENTAL: Specify an expected percentage of low quality 0/1 and 1/1 variants called in the pileup mode for full-alignment mode calling, default: 0.3.
+--ref_pct_full=FLOAT          EXPERIMENTAL: Specify an expected percentage of low quality 0/0 variants called in the pileup mode for full-alignment mode calling, default: 0.1.
+--var_pct_phasing=FLOAT       EXPERIMENTAL: Specify an expected percentage of high quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms.
+--pileup_model_prefix=STR     EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002, and $prefix.index, default: pileup.
+--keep_iupac_bases            EXPERIMENTAL: Keep IUPAC reference and alternate bases, default: convert all IUPAC bases to N.
+--base_err=FLOAT              EXPERIMENTAL: Estimated base error rate when enabling the gvcf option, default: 0.05 (setting a smaller value will increase the number of ./. outputs).
+--gq_bin_size=INT             EXPERIMENTAL: Default gq bin size for merging non-variant blocks when enabling the gvcf option, default: 5.
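+```
+
+For example, to additionally produce GVCF output and WhatsHap-phased VCFs using the v0.3 options, one might extend the General Usage command as below. This is a sketch: the `${_BAM_C}`-style variables are the same placeholders as in General Usage, and the flag values shown are illustrative rather than recommendations.
+
+```bash
+# same trio inputs as in General Usage, plus GVCF output, kept IUPAC bases, and phased VCFs
+./run_clair3_nova.sh \
+  --bam_fn_c=${_BAM_C} \
+  --bam_fn_p1=${_BAM_P1} \
+  --bam_fn_p2=${_BAM_P2} \
+  --output=${_OUTPUT_DIR} \
+  --ref_fn=${_REF} \
+  --threads=${_THREADS} \
+  --model_path_clair3="${_MODEL_DIR_C3}" \
+  --model_path_clair3_nova="${_MODEL_DIR_C3D}" \
+  --gvcf \
+  --base_err=0.05 \
+  --keep_iupac_bases \
+  --enable_output_phasing
+```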
## Output Files
diff --git a/clair3/CallVarBam.py b/clair3/CallVarBam.py
index 35b79a1..d578abb 100644
--- a/clair3/CallVarBam.py
+++ b/clair3/CallVarBam.py
@@ -117,6 +117,7 @@ def Run(args):
 fast_mode = CommandOption('fast_mode', args.fast_mode)
 call_snp_only_mode = CommandOption('call_snp_only', args.call_snp_only)
 enable_long_indel_mode = CommandOption('enable_long_indel', args.enable_long_indel)
+ keep_iupac_bases_mode = CommandOption('keep_iupac_bases', args.keep_iupac_bases)
 ctgStart = None
 ctgEnd = None
@@ -222,7 +223,8 @@ def Run(args):
 chunk_id,
 chunk_num,
 gvcf_mode,
- enable_long_indel_mode
+ enable_long_indel_mode,
+ keep_iupac_bases_mode,
 ]
 try:
@@ -371,6 +373,9 @@ def main():
 parser.add_argument('--enable_long_indel', type=str2bool, default=False,
 help="EXPERIMENTAL: Enable long Indel variants(>50 bp) calling")
+ parser.add_argument('--keep_iupac_bases', type=str2bool, default=False,
+ help="EXPERIMENTAL: Keep IUPAC (non ACGTN) reference and alternate bases, default: convert all IUPAC bases to N")
+
 # options for debug purpose
 parser.add_argument('--phasing_info_in_bam', action='store_true',
 help="DEBUG: Skip phasing and use the phasing info provided in the input BAM (HP tag), default: False")
diff --git a/clair3/CallVariants.py b/clair3/CallVariants.py
index ce8c92c..7f497fd 100644
--- a/clair3/CallVariants.py
+++ b/clair3/CallVariants.py
@@ -18,7 +18,8 @@ )
 import clair3.utils as utils
 from clair3.task.genotype import Genotype, genotype_string_from, genotype_enum_from, genotype_enum_for_task
-from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, log_error, log_warning
+from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, \
+ log_error, log_warning, convert_iupac_to_n
 from clair3.task.variant_length import VariantLength
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
 logging.basicConfig(format='%(message)s', level=logging.INFO)
@@ -39,7 +40,8 @@ 'gvcf',
 'pileup',
 'enable_long_indel',
-'maximum_variant_length_that_need_infer'
+'maximum_variant_length_that_need_infer',
+'keep_iupac_bases'
 ])
 OutputUtilities = namedtuple('OutputUtilities', [
 'print_debug_message',
@@ -199,7 +201,8 @@ def Run(args):
 gvcf=args.gvcf,
 pileup=args.pileup,
 enable_long_indel=args.enable_long_indel,
-maximum_variant_length_that_need_infer=maximum_variant_length_that_need_infer
+maximum_variant_length_that_need_infer=maximum_variant_length_that_need_infer,
+keep_iupac_bases=args.keep_iupac_bases
 )
 output_utilities = output_utilties_from(
 sample_name=args.sampleName,
@@ -1306,6 +1309,10 @@ def decode_alt_info(alt_info_dict):
 is_reference=is_reference
 )
+ if not output_config.keep_iupac_bases:
+ reference_base = convert_iupac_to_n(reference_base)
+ alternate_base = convert_iupac_to_n(alternate_base)
+
 if output_config.is_debug:
 output_utilities.print_debug_message(
 chromosome,
@@ -1810,6 +1817,9 @@ def main():
 parser.add_argument('--enable_long_indel', type=str2bool, default=False,
 help="EXPERIMENTAL: Enable long Indel variants(>50 bp) calling")
+ parser.add_argument('--keep_iupac_bases', type=str2bool, default=False,
+ help="EXPERIMENTAL: Keep IUPAC (non ACGTN) reference and alternate bases, default: convert all IUPAC bases to N")
+
 # options for debug purpose
 parser.add_argument('--use_gpu', type=str2bool, default=False,
 help="DEBUG: Use GPU for calling. Speed up is mostly insignificant. Only use this for building your own pipeline")
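The `keep_iupac_bases` plumbing above only changes how REF/ALT bases are printed at output time. A quick way to check whether a finished VCF still contains IUPAC (non-ACGTN) bases, e.g. after calling with `--keep_iupac_bases`, is to scan the REF and ALT columns; this is a sketch, and the output path below is a hypothetical example:

```bash
# list records whose REF or ALT contains a character outside ACGTN
# (IUPAC ambiguity codes such as R, Y, S, W, K, M)
zcat ${OUTPUT_DIR}/output.vcf.gz \
  | awk -F'\t' '!/^#/ && ($4 ~ /[^ACGTN]/ || $5 ~ /[^ACGTN,.*]/)' \
  | head
```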
diff --git a/docs/pileup_training.md b/docs/pileup_training.md
new file mode 100644
index 0000000..b99cbe1
--- /dev/null
+++ b/docs/pileup_training.md
@@ -0,0 +1,329 @@
+# Train a model for Clair3 pileup calling
+
+This document shows how to train and fine-tune a deep-learning model for Clair3 pileup calling. For training a model for full-alignment calling, please check [here](full_alignment_training.md). Clair3 needs both a pileup model and a full-alignment model to work. Compared to [Clair](https://github.com/HKU-BAL/Clair/blob/master/docs/TRAIN.md), the training workflow of Clair3 is simplified. The training materials are grouped according to sample, coverage, and chromosome. The groups are converted into tensor binaries. The binaries are much more space-efficient and easier to process. Multiple tensor binaries can be used together for model training and fine-tuning as required.
+
+![image](images/clair3_data_generation.png)
+
+---
+
+## Prerequisites
+
+- Clair3 installed
+- GNU Parallel installed
+- Sufficient hard-disk space
+- Truth VCF file after representation unification (check [here](https://github.com/HKU-BAL/Clair3/blob/main/docs/representation_unification.md) on how to generate a unified VCF)
+- A high-end GPU (tested on RTX Titan, RTX 2080 Ti, and GTX 1080 Ti)
+
+---
+
+## Contents
+
+* [I. Training data subsampling](#i-training-data-subsampling-recommended)
+* [II. Build compressed binary files](#ii-build-compressed-binary-files-for-pileup-model-training)
+  - [1. Setup variables](#1-setup-variables)
+  - [2. Create working directories](#2-create-temporary-working-folders-for-each-submodule)
+  - [3. Split and extend bed regions](#3-split-bed-regions-using-the-splitextendbed-submodule)
+  - [4. Get truth variants from unified VCF file](#4-get-truth-variants-from-unified-vcf-using-the-gettruth-submodule)
+  - [5. Create pileup tensor](#5-create-pileup-tensor-using-the-createtrainingtensor-submodule)
+  - [6. Merge compressed binaries](#6-merge-compressed-binaries-using-the-mergebin-submodule)
+* [III. Model training](#iii-model-training)
+  - [1. Pileup model training](#1-pileup-model-training)
+  - [2. Pileup model fine-tune using pre-trained model](#2-pileup-model-fine-tune-using-pre-trained-model-optional)
+
+---
+
+## I. Training data subsampling
+
+To build a training dataset with multiple coverages, we need to create multiple subsampled BAM files from the original full-depth BAM file.
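+
+The `-s INT.FRAC` option of `samtools view` used below takes a random seed (`INT`) and a keep fraction (`.FRAC`); the `DEPTHS` codes are simply the three digits of that fraction. A minimal sketch (not part of the pipeline) for deriving the codes from a measured full coverage, here assuming a 64x input BAM as in the downsampling notes further below:
+
+```bash
+FULL_DEPTH=64                # measured full coverage, e.g. from mosdepth
+for TARGET in 10 30 60; do   # target coverages (x)
+  awk -v t=${TARGET} -v d=${FULL_DEPTH} 'BEGIN { printf "%.3f\n", t / d }'
+done
+# prints 0.156, 0.469, 0.938 -> use the digits after the dot (156, 469, 938) as DEPTHS codes
+```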
+
+```bash
+# Please make sure the provided bam files are sorted and indexed
+ALL_BAM_FILE_PATH=(
+'hg002.bam'
+'hg002.bam'
+'hg002.bam'
+)
+
+# Each line represents one input sample name
+ALL_SAMPLE=(
+'hg002'
+'hg002'
+'hg002'
+)
+
+# Each line represents the subsample ratio for each sample
+## FRAC values for 'samtools view -s INT.FRAC'
+## please refer to samtools' documentation for further information
+## we set 90%, 60% and 30% of the full coverage as an example
+DEPTHS=(
+900
+600
+300)
+
+# Output folder to store all subsampled BAM
+SUBSAMPLED_BAMS_FOLDER_PATH="[SUBSAMPLED_BAMS_FOLDER_PATH]"
+mkdir -p ${SUBSAMPLED_BAMS_FOLDER_PATH}
+
+# Other parameters
+THREADS=8
+THREADS_LOW=$((${THREADS}*3/4))
+if [[ ${THREADS_LOW} -lt 1 ]]; then THREADS_LOW=1; fi
+PARALLEL='parallel'
+SAMTOOLS='samtools'
+
+# Subsample BAM to specific depths in parallel
+${PARALLEL} -j${THREADS} "${SAMTOOLS} view -@12 -s {2}.{2} -b -o ${SUBSAMPLED_BAMS_FOLDER_PATH}/{2}_{1}.bam {3}" ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]} :::+ ${ALL_BAM_FILE_PATH[@]}
+${PARALLEL} -j${THREADS} "${SAMTOOLS} index ${SUBSAMPLED_BAMS_FOLDER_PATH}/{2}_{1}.bam" ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]}
+
+# Add symbolic links for the full-coverage BAM and bai index
+${PARALLEL} "ln -s {2} ${SUBSAMPLED_BAMS_FOLDER_PATH}/{1}_1000.bam" ::: ${ALL_SAMPLE[@]} :::+ ${ALL_BAM_FILE_PATH[@]}
+${PARALLEL} "ln -s {2}.bai ${SUBSAMPLED_BAMS_FOLDER_PATH}/{1}_1000.bam.bai" ::: ${ALL_SAMPLE[@]} :::+ ${ALL_BAM_FILE_PATH[@]}
+
+```
+
+----
+
+## II. Build compressed binary files for pileup model training
+
+> - The whole procedure is broken into blocks for better readability and error tracing.
+> - For each `parallel` command run with the `--joblog` option, check the `Exitval` column of the job log output. If the column contains a non-zero value, an error occurred; please rerun the failed block.
+> - We suggest using absolute paths EVERYWHERE.
+> - You can use a truth VCF file without representation unification, but you might want to do so only for testing, because Clair3's performance would be significantly affected without representation unification.
+
+This section shows how to build multiple compressed tensor binary files for multiple samples, either with or without multiple coverages.
+
+#### 1. Setup variables
+```bash
+# Setup executable variables
+CLAIR3="clair3.py"                                      # clair3.py
+PYPY="[PYPY_BIN_PATH]"                                  # e.g. pypy3
+PYTHON3="[PYTHON3_BIN_PATH]"                            # e.g. python3
+PARALLEL="[PARALLEL_BIN_PATH]"                          # e.g. parallel
+SAMTOOLS="[SAMTOOLS_BIN_PATH]"                          # e.g. samtools
+
+# Input parameters
+PLATFORM="[SEQUENCING_PLATFORM]"                        # e.g. {ont, hifi, ilmn}
+UNIFIED_VCF_FILE_PATH="[YOUR_VCF_FILE_PATH_ARRAY]"      # e.g. hg002.unified.vcf.gz
+ALL_BAM_FILE_PATH="[YOUR_BAM_FILE_PATH_ARRAY]"          # e.g. hg002.bam
+DEPTHS="[YOUR_DEPTHS_OF_SAMPLES_ARRAY]"                 # e.g. 1000 (means no subsampling)
+ALL_REFERENCE_FILE_PATH="[YOUR_FASTA_FILE_PATH_ARRAY]"  # e.g. hg002.fasta
+ALL_BED_FILE_PATH="[YOUR_BED_FILE_PATH_ARRAY]"          # e.g. hg002.bed
+ALL_SAMPLE="[YOUR_SAMPLE_NAME_ARRAY]"                   # e.g. hg002
+OUTPUT_DIR="[YOUR_OUTPUT_FOLDER_PATH]"                  # e.g. output_folder
+
+# Each line represents one input BAM with a matched coverage in the "DEPTHS" array
+## check the "Training data subsampling" section on how to apply BAM subsampling
+ALL_BAM_FILE_PATH=(
+'hg002_1000.bam'
+'hg002_800.bam'
+'hg004_1000.bam'
+)
+
+# Each line represents the subsample ratio for each sample, 1000 if no subsampling applies
+DEPTHS=(
+1000
+800
+1000
+)
+
+# Each line represents one input sample name
+ALL_SAMPLE=(
+'hg002'
+'hg002'
+'hg004'
+)
+
+# Each line represents the reference file of each sample
+ALL_REFERENCE_FILE_PATH=(
+'GRCh38.fa'
+'GRCh38.fa'
+'GRCh38.fa'
+)
+
+# Each line represents one BED region file for each sample
+ALL_BED_FILE_PATH=(
+'hg002.bed'
+'hg002.bed'
+'hg004.bed'
+)
+
+# Each line represents one representation-unified VCF file for each sample
+UNIFIED_VCF_FILE_PATH=(
+'hg002_1000.unified.vcf.gz'
+'hg002_800.unified.vcf.gz'
+'hg004_1000.unified.vcf.gz'
+)
+
+# Chromosome prefix ("chr" if chromosome names have the "chr" prefix)
+CHR_PREFIX="chr"
+
+# array of chromosomes (do not include the "chr" prefix) to train on in all samples
+## please note that in the pretrained Clair3 models, we have excluded chr20 as a hold-out set.
+CHR=(21 22)
+
+# Number of threads to be used
+THREADS=8
+
+# Number of chunks to be divided into for parallel processing
+chunk_num=15
+CHUNK_LIST=`seq 1 ${chunk_num}`
+
+# The number of chunks to be divided for bin file generation for parallel processing
+bin_chunk_num=1
+BIN_CHUNK_LIST=`seq 1 ${bin_chunk_num}`
+
+# Minimum SNP and INDEL AF required for a candidate variant
+MIN_SNP_AF=0.08
+MIN_INDEL_AF=0.15
+
+# Maximum non-variant ratio for pileup model training; we use variant:non-variant = 1:5
+MAXIMUM_NON_VARIANT_RATIO=5
+
+```
+
+#### 2. Create temporary working folders for each submodule
+```bash
+# Temporary working directories
+DATASET_FOLDER_PATH="${OUTPUT_DIR}/build"
+TENSOR_CANDIDATE_PATH="${DATASET_FOLDER_PATH}/tensor_can"
+BINS_FOLDER_PATH="${DATASET_FOLDER_PATH}/bins"
+SPLIT_BED_PATH="${DATASET_FOLDER_PATH}/split_beds"
+VAR_OUTPUT_PATH="${DATASET_FOLDER_PATH}/var"
+
+mkdir -p ${DATASET_FOLDER_PATH}
+mkdir -p ${TENSOR_CANDIDATE_PATH}
+mkdir -p ${BINS_FOLDER_PATH}
+mkdir -p ${SPLIT_BED_PATH}
+mkdir -p ${VAR_OUTPUT_PATH}
+
+```
+
+#### 3. Split and extend bed regions using the `SplitExtendBed` submodule
+```bash
+cd ${OUTPUT_DIR}
+
+# Split BED file regions according to the contig names and extend the bed regions
+${PARALLEL} --joblog ${DATASET_FOLDER_PATH}/split_extend_bed.log -j${THREADS} \
+"${PYPY} ${CLAIR3} SplitExtendBed \
+  --bed_fn {4} \
+  --output_fn ${SPLIT_BED_PATH}/{2}_{3}_{1} \
+  --ctgName ${CHR_PREFIX}{1}" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]} :::+ ${ALL_BED_FILE_PATH[@]}
+
+```
+
+#### 4. Get truth variants from unified VCF using the `GetTruth` submodule
+
+```bash
+${PARALLEL} --joblog ${VAR_OUTPUT_PATH}/get_truth.log -j${THREADS} \
+"${PYPY} ${CLAIR3} GetTruth \
+  --vcf_fn {4} \
+  --ctgName ${CHR_PREFIX}{1} \
+  --var_fn ${VAR_OUTPUT_PATH}/var_{2}_{3}_{1}" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]} :::+ ${UNIFIED_VCF_FILE_PATH[@]}
+
+```
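+
+Throughout these blocks, `:::` and `:::+` drive the parallelization: `:::` forms a cross product over its argument lists, while `:::+` pairs its list element-wise with the previous one (which is why all per-sample arrays must have the same length). A quick, self-contained way to sanity-check the pairing before launching long jobs:
+
+```bash
+# {1} iterates chromosomes; {2} and {3} stay linked (sample <-> depth)
+parallel -k echo chr{1} {2} {3} ::: 21 22 ::: hg002 hg004 :::+ 1000 800
+# chr21 hg002 1000
+# chr21 hg004 800
+# chr22 hg002 1000
+# chr22 hg004 800
+```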
+#### 5. Create pileup tensor using the `CreateTrainingTensor` submodule
+
+```bash
+# Create pileup tensor for model training
+${PARALLEL} --joblog ${DATASET_FOLDER_PATH}/create_tensor_pileup.log -j${THREADS_LOW} \
+"${PYPY} ${CLAIR3} CreateTrainingTensor \
+  --bam_fn {4} \
+  --ref_fn {5} \
+  --var_fn ${VAR_OUTPUT_PATH}/var_{2}_{3}_{1} \
+  --bin_fn ${TENSOR_CANDIDATE_PATH}/tensor_{2}_{3}_{1}_{7} \
+  --ctgName ${CHR_PREFIX}{1} \
+  --samtools ${SAMTOOLS} \
+  --snp_min_af ${MIN_SNP_AF} \
+  --indel_min_af ${MIN_INDEL_AF} \
+  --extend_bed ${SPLIT_BED_PATH}/{2}_{3}_{1} \
+  --bed_fn {6} \
+  --pileup \
+  --platform ${PLATFORM} \
+  --shuffle \
+  --maximum_non_variant_ratio ${MAXIMUM_NON_VARIANT_RATIO} \
+  --chunk_id {7} \
+  --chunk_num ${chunk_num}" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]} :::+ ${ALL_BAM_FILE_PATH[@]} :::+ ${ALL_REFERENCE_FILE_PATH[@]} :::+ ${ALL_BED_FILE_PATH[@]} ::: ${CHUNK_LIST[@]} |& tee ${DATASET_FOLDER_PATH}/CTP.log
+
+```
+
+**Options**
+
+ - `--allow_duplicate_chr_pos` : for multiple-coverage training, this option is required to allow training samples of different coverages at the same variant site.
+ - `--shuffle` : as the input tensors are created in the order of starting positions, this option shuffles the training data in each chunk. During the training process, we also apply index reshuffling in each epoch.
+ - `--maximum_non_variant_ratio` : we set a maximum non-variant ratio (variant:non-variant = 1:5) for pileup model training; non-variants are randomly selected from the candidate set if the ratio is exceeded, otherwise all non-variants are used for training.
+ - `--max_depth` : sets the depth cap for every genome position. Pileup input summarizes position-level read alignments, where depth information varies across the training materials. If not constrained by computational resources, we recommend setting the depth cap to the maximum depth coverage of the training materials.
+
+#### 6. Merge compressed binaries using the `MergeBin` submodule
+
+```bash
+# Merge compressed binaries
+${PARALLEL} --joblog ${DATASET_FOLDER_PATH}/mergeBin.log -j${THREADS} \
+"${PYTHON3} ${CLAIR3} MergeBin \
+  ${TENSOR_CANDIDATE_PATH}/tensor_{2}_{3}_{1}_* \
+  --out_fn ${BINS_FOLDER_PATH}/bin_{2}_{3}_{1} \
+  --pileup" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]}
+
+```
+
+----
+
+## III. Model training
+
+We provide two optional training modes:
+
+**Option 1**: Train a pileup model on a new dataset from scratch. In this mode, we use randomly initialized model weights and train the model until it reaches the maximum number of epochs (30) or converges.
+
+**Option 2**: Fine-tune a pileup model using pre-trained parameters and a smaller learning rate for better convergence on the new dataset.
+
+***We recommend using the fine-tune mode (option 2) for better robustness.***
+
+#### 1. Pileup model training (option 1)
+
+```bash
+MODEL_FOLDER_PATH="${OUTPUT_DIR}/train"
+mkdir -p ${MODEL_FOLDER_PATH}
+
+cd ${MODEL_FOLDER_PATH}
+
+# A single GPU is used for model training
+export CUDA_VISIBLE_DEVICES="0"
+${PYTHON3} ${CLAIR3} Train \
+  --bin_fn ${BINS_FOLDER_PATH} \
+  --ochk_prefix ${MODEL_FOLDER_PATH}/pileup \
+  --pileup \
+  --add_indel_length False \
+  --random_validation \
+  --platform ${PLATFORM}
+
+```
+
+**Options**
+
+ - `--pileup` : enable pileup model training mode (full-alignment mode is enabled if the option is not set).
+ - `--add_indel_length` : enable or disable the two indel-length tasks. In the pre-trained models, the two tasks are disabled in pileup calling.
+ - `--random_validation` : randomly hold out 10% of all candidate sites as validation data; the best-performing epoch on the validation data is selected as the pre-trained model.
+
+#### 2. Pileup model fine-tune using pre-trained model (option 2)
+
+```bash
+# Pileup model fine-tuning using a new sample
+MODEL_FOLDER_PATH="${OUTPUT_DIR}/train"
+mkdir -p ${MODEL_FOLDER_PATH}
+
+cd ${MODEL_FOLDER_PATH}
+
+export CUDA_VISIBLE_DEVICES="0"
+${PYTHON3} ${CLAIR3} Train \
+  --pileup \
+  --bin_fn ${BINS_FOLDER_PATH} \
+  --ochk_prefix ${MODEL_FOLDER_PATH}/pileup \
+  --add_indel_length False \
+  --random_validation \
+  --platform ${PLATFORM} \
+  --learning_rate 0.0001 \
+  --chkpnt_fn "[YOUR_PRETRAINED_MODEL]"   ## use a pre-trained pileup model here
+
+```
+
+We experimentally offer pileup model fine-tuning from a pre-trained Clair3 pileup model by using a smaller `learning_rate` and a pre-trained model `chkpnt_fn`. We recommend starting with a smaller learning rate such as `1e-4` when fine-tuning a pre-trained pileup model.
diff --git a/docs/trio/1_run_RU_HG002.sh b/docs/trio/1_run_RU_HG002.sh
new file mode 100644
index 0000000..cdef863
--- /dev/null
+++ b/docs/trio/1_run_RU_HG002.sh
@@ -0,0 +1,151 @@
+source /autofs/bal36/jhsu/trio/script/data_config_trio.sh
+
+# ** please update the following paths
+REFERENCE_FILE_PATH="${REF_FILE_PATH}"
+
+# Input parameters
+VCF_FILE_PATH="${CHILD_VCF_FILE_PATH}"
+BED_FILE_PATH="${CHILD_BED_FILE_PATH}"
+BAM_FILE_PATH=/autofs/bal36/jhsu/fast5/r1041/hac/HG002/hg002.pass.cram
+OUTPUT_DIR="/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG002"
+# ** end
+
+
+
+
+# Chromosome prefix ("chr" if chromosome names have the "chr" prefix)
+CHR_PREFIX="chr"
+
+# array of chromosomes (do not include the "chr" prefix)
+CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22)
+
+# Number of threads to be used
+THREADS=24
+
+# The number of chunks to be divided into for parallel processing
+chunk_num=15
+CHUNK_LIST=`seq 1 ${chunk_num}`
+
+# Minimum AF required for a candidate variant
+MIN_AF=0.08
+
+# Temporary working directory
+SPLIT_BED_PATH="${OUTPUT_DIR}/split_beds"
+VCF_OUTPUT_PATH="${OUTPUT_DIR}/vcf_output"
+VAR_OUTPUT_PATH="${OUTPUT_DIR}/var"
+PHASE_VCF_PATH="${OUTPUT_DIR}/phased_vcf"
+PHASE_BAM_PATH="${OUTPUT_DIR}/phased_bam"
+
+mkdir -p ${SPLIT_BED_PATH}
+mkdir -p ${VCF_OUTPUT_PATH}
+mkdir -p ${VAR_OUTPUT_PATH}
+mkdir -p ${PHASE_VCF_PATH}
+mkdir -p ${PHASE_BAM_PATH}
+
+
+
+
+cd ${OUTPUT_DIR}
+
+# WhatsHap unphase the vcf file first, in case it already includes phased ('|') genotypes
+${WHATSHAP} unphase ${VCF_FILE_PATH} > ${OUTPUT_DIR}/INPUT.vcf.gz
+
+# WhatsHap phase the vcf file
+${PARALLEL} --joblog ${PHASE_VCF_PATH}/phase.log -j${THREADS} \
+"${WHATSHAP} phase \
+--output ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \
+--reference ${REFERENCE_FILE_PATH} \
+--chromosome ${CHR_PREFIX}{1} \
+--ignore-read-groups \
+--distrust-genotypes \
+${OUTPUT_DIR}/INPUT.vcf.gz \
+${BAM_FILE_PATH}" ::: ${CHR[@]}
+
+# Index the phased vcf file
+${PARALLEL} -j ${THREADS} tabix -p vcf ${PHASE_VCF_PATH}/phased_{1}.vcf.gz ::: ${CHR[@]}
+
+
+# WhatsHap haplotag the bam file
+${PARALLEL} --joblog ${PHASE_BAM_PATH}/haplotag.log -j${THREADS} \
+"${WHATSHAP} haplotag \
+--output ${PHASE_BAM_PATH}/{1}.bam \
+--reference ${REFERENCE_FILE_PATH} \
+--regions ${CHR_PREFIX}{1} \
+--ignore-read-groups \
+${PHASE_VCF_PATH}/phased_{1}.vcf.gz \
+${BAM_FILE_PATH}" ::: ${CHR[@]}
+
+# Index the phased bam file using samtools
+${PARALLEL} --joblog ${PHASE_BAM_PATH}/index.log -j ${THREADS} ${SAMTOOLS} index -@12 ${PHASE_BAM_PATH}/{1}.bam ::: ${CHR[@]}
+
+
+
+# Split bed file regions according to the contig name and extend the bed regions
+${PARALLEL} --joblog ${SPLIT_BED_PATH}/split_extend_bed.log -j${THREADS} \
+"${PYPY} ${CLAIR3} SplitExtendBed \
+--bed_fn ${BED_FILE_PATH} \
+--output_fn ${SPLIT_BED_PATH}/{1} \
+--ctgName ${CHR_PREFIX}{1}" ::: ${CHR[@]}
+
+# Get true variant label information from the VCF file
+${PARALLEL} --joblog ${VAR_OUTPUT_PATH}/get_truth.log -j${THREADS} \
+"${PYPY} ${CLAIR3} GetTruth \
+--vcf_fn ${PHASE_VCF_PATH}/phased_{1}.vcf.gz \
+--ctgName ${CHR_PREFIX}{1} \
+--var_fn ${VAR_OUTPUT_PATH}/var_{1}" ::: ${CHR[@]}
+
+CLAIR3=/autofs/bal36/jhsu/trio/Clair3-Nova_dev/clair3.py
+${PARALLEL} --joblog ${OUTPUT_DIR}/unify_repre.log -j${THREADS} \
+"${PYPY} ${CLAIR3} UnifyRepresentation \
+--bam_fn ${PHASE_BAM_PATH}/{1}.bam \
+--var_fn ${VAR_OUTPUT_PATH}/var_{1} \
+--ref_fn ${REFERENCE_FILE_PATH} \
+--bed_fn ${BED_FILE_PATH} \
+--extend_bed ${SPLIT_BED_PATH}/{1} \
+--output_vcf_fn ${VCF_OUTPUT_PATH}/vcf_{1}_{2} \
+--min_af ${MIN_AF} \
+--chunk_id {2} \
+--chunk_num ${chunk_num} \
+--platform ${PLATFORM} \
+--ctgName ${CHR_PREFIX}{1}" ::: ${CHR[@]} ::: ${CHUNK_LIST[@]} > ${OUTPUT_DIR}/RU.log
+
+
+cat ${VCF_OUTPUT_PATH}/vcf_* | ${PYPY} ${CLAIR3} SortVcf --output_fn ${OUTPUT_DIR}/unified.vcf
+bgzip -f ${OUTPUT_DIR}/unified.vcf
+tabix -f -p vcf ${OUTPUT_DIR}/unified.vcf.gz
+
+
+RU_VAR_OUTPUT_PATH="${OUTPUT_DIR}/var_ru"
+mkdir -p ${RU_VAR_OUTPUT_PATH}
+
+# Get true variant label information from the unified VCF file
+${PARALLEL} --joblog ${RU_VAR_OUTPUT_PATH}/get_truth.log -j${THREADS} \
+"${PYPY} ${CLAIR3} GetTruth \
+--vcf_fn ${OUTPUT_DIR}/unified.vcf.gz \
+--ctgName ${CHR_PREFIX}{1} \
+--var_fn ${RU_VAR_OUTPUT_PATH}/var_{1}" ::: ${CHR[@]}
+
+
+
+${SAMTOOLS} merge -@48 ${OUTPUT_DIR}/merged.bam ${PHASE_BAM_PATH}/*.bam
+${SAMTOOLS} index -@48 ${OUTPUT_DIR}/merged.bam
+
+docker run \
+--mount type=bind,source=${VCF_FILE_PATH},target=/true_vcf \
+--mount type=bind,source=${BED_FILE_PATH},target=/true_bed \
+-v "${OUTPUT_DIR}/happy":"/happy" \
+-v `dirname ${REF_FILE_PATH}`:"/ref" \
+-v "${OUTPUT_DIR}":"/input" \
+pkrusche/hap.py /opt/hap.py/bin/hap.py \
+/true_vcf \
+/input/unified.vcf.gz \
+-f /true_bed \
+-r /ref/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa \
+-o /happy/output \
+--engine=vcfeval \
+--pass-only \
+--threads=8
+
+
+python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --happy_vcf_fn ${OUTPUT_DIR}/happy/output.vcf.gz
+# You should get an F1-score of around 0.999 if the run completed correctly.
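Every `parallel` call in this script writes a `--joblog`; as noted in the pileup-training document, a non-zero `Exitval` marks a failed job that should be rerun. A small sketch for scanning the job logs under the output folder in one pass (the glob assumes the log locations used above; adjust it to your layout):

```bash
# print any parallel job that exited non-zero (column 7 of a --joblog file is Exitval)
for log in ${OUTPUT_DIR}/*.log ${OUTPUT_DIR}/*/*.log; do
    awk -v f="$log" 'NR > 1 && $7 ~ /^[0-9]+$/ && $7 != 0 { print f ": job " $1 " exited " $7 }' "$log"
done
```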
diff --git a/docs/trio/2_gen_downsample_phased_bam_p.sh b/docs/trio/2_gen_downsample_phased_bam_p.sh
new file mode 100644
index 0000000..ac0072e
--- /dev/null
+++ b/docs/trio/2_gen_downsample_phased_bam_p.sh
@@ -0,0 +1,62 @@
+tar_dir="/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/"
+#_SAMPLE_N='HG002'
+
+# example input
+# DEPTHS_N=(10 30 60)
+# DEPTHS=(156 469 938)
+# bash 2_gen_downsample_phased_bam_p.sh HG004 "${DEPTHS_N[*]}" "${DEPTHS[*]}"
+# or bash 2_gen_downsample_phased_bam_p.sh HG004 "10 30 60" "156 469 938"
+
+# for coverage 71x
+# DEPTHS_N=(10 30 60)
+# DEPTHS=(140 421 842)
+
+# for coverage 80.84x
+# DEPTHS_N=(10 30 60)
+# DEPTHS=(124 371 742)
+
+_SAMPLE_N="$1"
+DEPTHS_N=($2)
+DEPTHS=($3)
+
+echo ${_SAMPLE_N} ${DEPTHS_N[@]} ${DEPTHS[@]}
+
+# Other parameters
+THREADS=8
+PARALLEL='parallel'
+SAMTOOLS='samtools'
+
+SUBSAMPLED_BAMS_FOLDER_PATH="/autofs/bal36/jhsu/trio/r1041_hac/2_bam/${_SAMPLE_N}/"
+mkdir -p ${SUBSAMPLED_BAMS_FOLDER_PATH}
+
+echo ${_SAMPLE_N}
+BAM_FILE_PATH=${tar_dir}/${_SAMPLE_N}/merged.bam
+
+cd ${tar_dir}/${_SAMPLE_N}
+
+# set the depths to downsample to, and their FRAC codes for `samtools view -s`
+# for example, if you have data coverage of 64x, you can downsample your data to 10x, 30x, 60x
+# via the following settings
+# DEPTHS_N=(10 30 60)
+# DEPTHS=(156 469 938)
+
+# note 1: you can check your data coverage via mosdepth:
+# mosdepth -t 5 -n -x --quantize 0:15:150: ${_SAMPLE_N}_phased ${BAM_FILE_PATH}
+# note 2: the `samtools view -s` FRAC codes can be generated via the following python snippet:
+# ```python
+# tar_d = "10 30 60".split()
+# max_d = 64
+# ' '.join([('%.3f' % (float(i) / max_d))[2:] for i in tar_d])
+# ```
+
+
+# Subsample BAM to specific depths in parallel
+${PARALLEL} -j${THREADS} "${SAMTOOLS} view -@12 -s 42.{1} -b -o ${SUBSAMPLED_BAMS_FOLDER_PATH}/${_SAMPLE_N}_{2}.bam ${BAM_FILE_PATH}" ::: ${DEPTHS[@]} :::+ ${DEPTHS_N[@]}
+${PARALLEL} -j${THREADS} "${SAMTOOLS} index ${SUBSAMPLED_BAMS_FOLDER_PATH}/${_SAMPLE_N}_{1}.bam" ::: ${DEPTHS_N[@]}
+
+cp ${BAM_FILE_PATH} ${SUBSAMPLED_BAMS_FOLDER_PATH}/${_SAMPLE_N}_80.bam
+${SAMTOOLS} index ${SUBSAMPLED_BAMS_FOLDER_PATH}/${_SAMPLE_N}_80.bam
+##
+###DEPTHS_N=(10 20 30 40 50 60 70 80)
+#cd ${SUBSAMPLED_BAMS_FOLDER_PATH}
+${PARALLEL} -j ${THREADS} "mosdepth -t 5 -n -x --quantize 0:15:150: ${_SAMPLE_N}_{1} ${SUBSAMPLED_BAMS_FOLDER_PATH}/${_SAMPLE_N}_{1}.bam " ::: ${DEPTHS_N[@]}

diff --git a/docs/trio/3_generate_downsample_pileup.sh b/docs/trio/3_generate_downsample_pileup.sh
index 98a17c5..a597967 100644
--- a/docs/trio/3_generate_downsample_pileup.sh
+++ b/docs/trio/3_generate_downsample_pileup.sh
@@ -1,4 +1,4 @@
-# using Clair3 to generating pileup calling on each downsampled sample
+source /autofs/bal36/jhsu/trio/script/data_config_trio.sh
 
 PARALLEL=parallel
 PYPY=pypy
@@ -7,18 +7,18 @@ PYTHON3=python3
 PLATFORM="ont"
 
 # Clair3 folder
-_ORI_CLAIR3="XXX"
+_ORI_CLAIR3="/autofs/bal36/jhsu/trio/Clair3-Nova"
 
 # note the use right models for your training
 # check https://github.com/HKU-BAL/Clair3#pre-trained-models
-_MODEL_DIR="XXXX/r941_prom_sup_g5014/"
+#_MODEL_DIR="/autofs/bal31/jhsu/home/data/clair3_models/r941_prom_sup_g5014/"
+_MODEL_DIR="/autofs/bal36/jhsu/trio/models/rerio/clair3_models/r1041_e82_400bps_hac_v430/"
 
 C3_THREADS=36 # Clair3 threads number
 
-# Clair3-Trio's clair3.py path
-CLAIR3_TRIO="XXX/clair3.py"
+# Clair3-Trio's path
+CLAIR3_TRIO="/autofs/bal36/jhsu/trio/Clair3-Nova_dev/clair3.py"
 
-# output folder for pileup files
-DATASET_FOLDER_PATH="XXX"
+DATASET_FOLDER_PATH="/autofs/bal36/jhsu/trio/r1041_hac"
 
 # creating working folder
PILEUP_OUTPUT_PATH="${DATASET_FOLDER_PATH}/3_pileup" @@ -29,35 +29,31 @@ cd ${PILEUP_OUTPUT_PATH} # input files and parameters # training chrosome name, and prefix -# excluded the chr20 for testing -CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) # please update this part for your dataset +CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) CHR_PREFIX="chr" -CHILD_SAMPLE_N="HG002" #please update this part for your dataset +# GH (tb update to github) +CHILD_SAMPLE_N="HG002" P1_SAMPLE_N="HG003" P2_SAMPLE_N="HG004" -# bam file from 2_generate_downsample_phased_bam -# please update this part for your dataset # sample name -# if you have four downsampled dataset, you need to secific 4 * 3 sample name in this setting. ALL_SAMPLE=( ${CHILD_SAMPLE_N} ${P1_SAMPLE_N} -${P2_SAMPLE_N} +${P2_SAMPLE_N} ${CHILD_SAMPLE_N} ${P1_SAMPLE_N} -${P2_SAMPLE_N} +${P2_SAMPLE_N} ${CHILD_SAMPLE_N} ${P1_SAMPLE_N} -${P2_SAMPLE_N} +${P2_SAMPLE_N} ${CHILD_SAMPLE_N} ${P1_SAMPLE_N} -${P2_SAMPLE_N} +${P2_SAMPLE_N} ) -# list number same as in $ALL_SAMPLE DEPTHS=( # data coverage 10 10 @@ -73,35 +69,32 @@ DEPTHS=( # data coverage 80 ) -# list number same as in $ALL_SAMPLE -# phased bam files, from the 2_generate_downsample_phased_bam.sh ALL_PHASED_BAM_FILE_PATH=( -"XXX/bam/phased/HG002/HG002_10.bam" -"XXX/bam/phased/HG003/HG003_10.bam" -"XXX/bam/phased/HG004/HG004_10.bam" -"XXX/bam/phased/HG002/HG002_30.bam" -"XXX/bam/phased/HG003/HG003_30.bam" -"XXX/bam/phased/HG004/HG004_30.bam" -"XXX/bam/phased/HG002/HG002_60.bam" -"XXX/bam/phased/HG003/HG003_60.bam" -"XXX/bam/phased/HG004/HG004_60.bam" -"XXX/bam/phased/HG002/HG002_80.bam" -"XXX/bam/phased/HG003/HG003_80.bam" -"XXX/bam/phased/HG004/HG004_80.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_10.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_10.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_10.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_30.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_30.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_30.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_60.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_60.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_60.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_80.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_80.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_80.bam" ) -# please update this part for your dataset -# HOME_DIR="/autofs/bal31/jhsu/home" -# REF_FILE_PATH="${HOME_DIR}/data/reference/grch38_no_alt_plus_hs38d1/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna" -# merged trio's bed file using the 0_gerneate_trio_bed.sh -# _TRIO_BED_PATH="${HOME_DIR}/data/giab/020304.bed" +#HOME_DIR="/autofs/bal31/jhsu/home" +#REF_FILE_PATH="${HOME_DIR}/data/reference/grch38_no_alt_plus_hs38d1/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna" +#CHILD_BED_FILE_PATH="${HOME_DIR}/data/giab/${CHILD_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +#P1_BED_FILE_PATH="${HOME_DIR}/data/giab/${P1_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +#P2_BED_FILE_PATH="${HOME_DIR}/data/giab/${P2_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" + +# GH +# merge trio's bed file using the gerneate_trio_bed.sh +#_TRIO_BED_PATH="${HOME_DIR}/data/giab/020304.bed" -# your reference file -REF_FILE_PATH="XXXX/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna" -# your trio bed file, from 0_generate_trio_bed.sh 
-_TRIO_BED_PATH="XXXX/data/giab/020304.bed" -# list number same as in $ALL_SAMPLE ALL_REFERENCE_FILE_PATH=( "${REF_FILE_PATH}" "${REF_FILE_PATH}" @@ -118,7 +111,6 @@ ALL_REFERENCE_FILE_PATH=( ) -# list number same as in $ALL_SAMPLE ALL_BED_FILE_PATH=( ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} diff --git a/docs/trio/4_1_downsample_bin.sh b/docs/trio/4_1_downsample_bin.sh index 875f3b5..2a78dc2 100644 --- a/docs/trio/4_1_downsample_bin.sh +++ b/docs/trio/4_1_downsample_bin.sh @@ -1,17 +1,29 @@ -# this script is used to downsample the generated bin files for model training -# this script is optional, but is recommand for efficient model training -# we downsample the generated bin files by selecting partial bin files into merged_bins folder, -# which will be used for model training. +BASE_DIR="/autofs/bal36/jhsu/trio/r1041_hac/4_build_tensors/build/" - -# original bin dataset -ori_dir="/autofs/bal36/jhsu/r10/output/4_build_tensors/build/ALL_1356/bins/" - -# downsampled bin dataset -new_dir="XXXXXX/output/4_build_tensors/build/merged_bins" +new_dir="${BASE_DIR}/merged_bins_dn" +# mkdir -p ${new_dir} - -_dep="10_10_10" +###rm ${new_dir}/* +## +ori_dir="${BASE_DIR}/ALL_1368_denovo/denovo_t_bins/" +#new_dir="/autofs/bal36/jhsu/trio/output/4_build_tensors/build/merged_bins_wg_nm_dn" +_dep=10 +_chr=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) +parallel "ln -s ${ori_dir}/HG003_TRIO_${_dep}_{1}_{2} ${new_dir}/HG003_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 1 10` +_dep=80 +_chr=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) +parallel "ln -s ${ori_dir}/HG003_TRIO_${_dep}_{1}_{2} ${new_dir}/HG003_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 1 10` +_dep=30 +_chr=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) +parallel "ln -s ${ori_dir}/HG003_TRIO_${_dep}_{1}_{2} ${new_dir}/HG003_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 1 10` +_dep=60 +_chr=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) +parallel "ln -s ${ori_dir}/HG003_TRIO_${_dep}_{1}_{2} ${new_dir}/HG003_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 1 10` + + + +ori_dir="${BASE_DIR}/ALL_1368/denovo_t_bins/" +_dep=10 _chr=(1 2 3 4 5 6 7 8 9) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` @@ -22,8 +34,7 @@ _chr=(16 17 18 19 21 22) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 3 3 10` - -_dep="30_30_30" +_dep=80 _chr=(10 11 12 13 14 15) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` @@ -35,7 +46,7 @@ parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_d -_dep="60_60_60" +_dep=30 _chr=(10 11 12 13 14 15) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` @@ -47,33 +58,62 @@ parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_d -_dep="80_80_80" +_dep=60 _chr=(16 17 18 7 8 9) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` +_chr=(1 2 3 4 5 6 19 21 22) +parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 2 3 10` +_chr=(10 11 12 13 14 15) +parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 3 3 10` -_dep="30_10_10" -_chr=(1 2 3 4 5 6 7 8 9) + + 
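+# Note: every ln -s block in this file follows the same pattern: link a subset
+# of the tensor bin chunks into ${new_dir}. The last `seq` argument selects the
+# chunk ids, e.g. `seq 1 3 10` keeps chunks 1, 4, 7 and 10 (roughly 1/3 of the
+# chunks per chromosome), while `seq 1 1 10` keeps all of them. A loop sketch of
+# the same idea (assuming the ori_dir/new_dir set above):
+#   for _dep in 10 30 60 80; do
+#     parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: $(seq 1 3 10)
+#   done
+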
+ori_dir="${BASE_DIR}/SUB_1368/denovo_t_bins/" + +#new_dir="/autofs/bal36/jhsu/trio/output/4_build_tensors/build/merged_bins" + + +_dep=SUB_31 +_chr=(7 10 13 16 19 21 22) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` -_dep="60_10_10" +#_chr=(10 11 12 13 14 15) +#parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 2 3 10` +# +#_chr=(16 17 18 19 21 22) +#parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 3 3 10` + + + +_dep=SUB_61 _chr=(10 11 12 13 14 15) +_chr=(1 4 7 10 13 21 22) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` -_dep="80_10_10" -_chr=(10 11 12) -parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` +#_chr=(16 17 18 19 21 22) +#parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 2 3 10` +# +#_chr=(1 2 3 4 5 6 7 8 9) +#parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 3 3 10` + + + +_dep=SUB_81 +#_chr=(10 11 12 13 14 15) +#parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` + +#_chr=(1 2 3 4 5 6 7 8 9) +#parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 2 3 10` -_dep="60_30_30" _chr=(16 17 18 19 21 22) +_chr=(1 4 7 10 19 21 22) parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 3 3 10` -_dep="80_30_30" -_chr=(1 2 3 4) -parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 2 3 10` -_dep="80_60_60" -_chr=(16 17 18) -parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 2 3 10` +_dep=SUB_13 +_chr=(16 17 18 7 8 9) +_chr=(1 10 13 16 19 21 22) +parallel "ln -s ${ori_dir}/HG002_TRIO_${_dep}_{1}_{2} ${new_dir}/HG002_TRIO_${_dep}_{1}_{2}" ::: ${_chr[@]} ::: `seq 1 3 10` diff --git a/docs/trio/4_create_tensors.sh b/docs/trio/4_create_tensors.sh index 9abfd8b..2d57a55 100644 --- a/docs/trio/4_create_tensors.sh +++ b/docs/trio/4_create_tensors.sh @@ -1,17 +1,4 @@ -# creating tensors for model training -# each tensors are generated from combination of trio samples -# e.g. 
in the following example, we generation different combination from [10x, 30x, 60x, 80x] data from HG002 trio into -# combination of (Child, Parent1, Parent2): -# (10x, 10x, 10x) #even trio coverage case -# (30x, 30x, 30x) #even trio coverage case -# (60x, 60x, 60x) #even trio coverage case -# (80x, 80x, 80x) #even trio coverage case -# (30x, 10x, 10x) #uneven trio coverage case -# (60x, 10x, 10x) #uneven trio coverage case -# (80x, 10x, 10x) #uneven trio coverage case -# (60x, 30x, 30x) #uneven trio coverage case -# (80x, 30x, 30x) #uneven trio coverage case -# (80x, 60x, 60x) #uneven trio coverage case +source /autofs/bal36/jhsu/trio/script/data_config_trio.sh PARALLEL=parallel PYPY=pypy @@ -22,11 +9,10 @@ THREADS=8 # threads number # Clair3-Trio's path -CLAIR3_TRIO="XXX/clair3.py" +CLAIR3_TRIO="/autofs/bal36/jhsu/trio/Clair3-Nova/clair3.py" # creating working folder -TRAIN_FOLDER_PREFIX="XXX" -# name for the tensors folder +TRAIN_FOLDER_PREFIX="/autofs/bal36/jhsu/trio/r1041_hac/4_build_tensors" BUILD_N="ALL_1368" # data building data, e.g. "HG002_all" # Temporary working directories @@ -59,15 +45,15 @@ _LOG_SUF="" # log file suffix # input files and parameters # training chrosome name, and prefix -# excluded the chr20 for testing CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) CHR_PREFIX="chr" +# GH (tb update to github) CHILD_SAMPLE_N="HG002" P1_SAMPLE_N="HG003" P2_SAMPLE_N="HG004" -# sample name, need to specific for # of combination (10 in thie scipt) * 3 times +# sample name ALL_SAMPLE=( ${CHILD_SAMPLE_N} # your child sample name ${P1_SAMPLE_N} # your parenet-1 sample name @@ -81,86 +67,36 @@ ${P2_SAMPLE_N} ${CHILD_SAMPLE_N} ${P1_SAMPLE_N} ${P2_SAMPLE_N} -${CHILD_SAMPLE_N} -${P1_SAMPLE_N} -${P2_SAMPLE_N} -${CHILD_SAMPLE_N} -${P1_SAMPLE_N} -${P2_SAMPLE_N} -${CHILD_SAMPLE_N} -${P1_SAMPLE_N} -${P2_SAMPLE_N} -${CHILD_SAMPLE_N} -${P1_SAMPLE_N} -${P2_SAMPLE_N} -${CHILD_SAMPLE_N} -${P1_SAMPLE_N} -${P2_SAMPLE_N} -${CHILD_SAMPLE_N} -${P1_SAMPLE_N} -${P2_SAMPLE_N} ) TRIO_N="${CHILD_SAMPLE_N}_TRIO" # your trio name, e.g. 
HG002_TRIO #echo ${ALL_SAMPLE[@]} -# (10x, 10x, 10x) #even trio coverage case -# (30x, 30x, 30x) #even trio coverage case -# (60x, 60x, 60x) #even trio coverage case -# (80x, 80x, 80x) #even trio coverage case -# (30x, 10x, 10x) #uneven trio coverage case -# (60x, 10x, 10x) #uneven trio coverage case -# (80x, 10x, 10x) #uneven trio coverage case -# (60x, 30x, 30x) #uneven trio coverage case -# (80x, 30x, 30x) #uneven trio coverage case -# (80x, 60x, 60x) #uneven trio coverage case -# data coverage combination name for each individual sample -# list number same as in $ALL_SAMPLE DEPTHS=( # data coverage -"10_10_10" -"10_10_10" -"10_10_10" -"30_30_30" -"30_30_30" -"30_30_30" -"60_60_60" -"60_60_60" -"60_60_60" -"80_80_80" -"80_80_80" -"80_80_80" -"30_10_10" -"30_10_10" -"30_10_10" -"60_10_10" -"60_10_10" -"60_10_10" -"80_10_10" -"80_10_10" -"80_10_10" -"60_30_30" -"60_30_30" -"60_30_30" -"80_30_30" -"80_30_30" -"80_30_30" -"80_60_60" -"80_60_60" -"80_60_60" +10 +10 +10 +30 +30 +30 +60 +60 +60 +80 +80 +80 ) +#HOME_DIR="/autofs/bal31/jhsu/home" +#REF_FILE_PATH="${HOME_DIR}/data/reference/grch38_no_alt_plus_hs38d1/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna" +#CHILD_BED_FILE_PATH="${HOME_DIR}/data/giab/${CHILD_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +#P1_BED_FILE_PATH="${HOME_DIR}/data/giab/${P1_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +#P2_BED_FILE_PATH="${HOME_DIR}/data/giab/${P2_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +# +## GH +## merge trio's bed file using the gerneate_trio_bed.sh +#_TRIO_BED_PATH="${HOME_DIR}/data/giab/020304.bed" -# please update this part for your dataset -HOME_DIR="XXX" -REF_FILE_PATH="${HOME_DIR}/data/reference/grch38_no_alt_plus_hs38d1/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna" -CHILD_BED_FILE_PATH="${HOME_DIR}/data/giab/${CHILD_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" -P1_BED_FILE_PATH="${HOME_DIR}/data/giab/${P1_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" -P2_BED_FILE_PATH="${HOME_DIR}/data/giab/${P2_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" - -# merge trio's bed file using the gerneate_trio_bed.sh -_TRIO_BED_PATH="${HOME_DIR}/data/giab/020304.bed" - -# list number same as in $ALL_SAMPLE ALL_REFERENCE_FILE_PATH=( "${REF_FILE_PATH}" "${REF_FILE_PATH}" @@ -174,28 +110,9 @@ ALL_REFERENCE_FILE_PATH=( "${REF_FILE_PATH}" "${REF_FILE_PATH}" "${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" ) -# list number same as in $ALL_SAMPLE ALL_BED_FILE_PATH=( ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} @@ -209,24 +126,6 @@ ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} ) @@ -236,121 +135,53 @@ ${_TRIO_BED_PATH} # check the representation_unification_trio.md page for more information # for practical concerns, the representation_unification_trio.md require only run once 
on the highest depth for each sample, while the low coverage can be sampled from the highest coverage data, i.e. merged.bam in the representation_unification folder -# files obtained from 3_generate_downsample_pileup.sh -# need to be corespind to $DEPTHS -# (10x, 10x, 10x) #even trio coverage case -# (30x, 30x, 30x) #even trio coverage case -# (60x, 60x, 60x) #even trio coverage case -# (80x, 80x, 80x) #even trio coverage case -# (30x, 10x, 10x) #uneven trio coverage case -# (60x, 10x, 10x) #uneven trio coverage case -# (80x, 10x, 10x) #uneven trio coverage case -# (60x, 30x, 30x) #uneven trio coverage case -# (80x, 30x, 30x) #uneven trio coverage case -# (80x, 60x, 60x) #uneven trio coverage case -# data coverage combination name for each individual sample ALL_PILEUP_VCF_FILE_PATH=( -"XXX/HG002_10/pileup.vcf.gz" -"XXX/HG003_10/pileup.vcf.gz" -"XXX/HG004_10/pileup.vcf.gz" -"XXX/HG002_30/pileup.vcf.gz" -"XXX/HG003_30/pileup.vcf.gz" -"XXX/HG004_30/pileup.vcf.gz" -"XXX/HG002_60/pileup.vcf.gz" -"XXX/HG003_60/pileup.vcf.gz" -"XXX/HG004_60/pileup.vcf.gz" -"XXX/HG002_80/pileup.vcf.gz" -"XXX/HG003_80/pileup.vcf.gz" -"XXX/HG004_80/pileup.vcf.gz" -"XXX/HG002_30/pileup.vcf.gz" -"XXX/HG003_10/pileup.vcf.gz" -"XXX/HG004_10/pileup.vcf.gz" -"XXX/HG002_60/pileup.vcf.gz" -"XXX/HG003_10/pileup.vcf.gz" -"XXX/HG004_10/pileup.vcf.gz" -"XXX/HG002_80/pileup.vcf.gz" -"XXX/HG003_10/pileup.vcf.gz" -"XXX/HG004_10/pileup.vcf.gz" -"XXX/HG002_60/pileup.vcf.gz" -"XXX/HG003_30/pileup.vcf.gz" -"XXX/HG004_30/pileup.vcf.gz" -"XXX/HG002_80/pileup.vcf.gz" -"XXX/HG003_30/pileup.vcf.gz" -"XXX/HG004_30/pileup.vcf.gz" -"XXX/HG003_80/pileup.vcf.gz" -"XXX/HG004_60/pileup.vcf.gz" -"XXX/HG004_60/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG002_10/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG003_10/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG004_10/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG002_30/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG003_30/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG004_30/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG002_60/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG003_60/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG004_60/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG002_80/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG003_80/pileup.vcf.gz" +"/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/HG004_80/pileup.vcf.gz" ) -# files folder obtained from 1_run_RU.sh ALL_RU_FILE_PATH=( -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" -"XXX/HG002/" -"XXX/HG003/" -"XXX/HG004/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG002/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG003/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG004/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG002/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG003/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG004/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG002/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG003/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG004/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG002/" 
+"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG003/" +"/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG004/" ) -# files obtained from 2_genreate_downsample_phased_bam.sh ALL_PHASED_BAM_FILE_PATH=( -"XXX/phased/HG002/HG002_10.bam" -"XXX/phased/HG003/HG003_10.bam" -"XXX/phased/HG004/HG004_10.bam" -"XXX/phased/HG002/HG002_30.bam" -"XXX/phased/HG003/HG003_30.bam" -"XXX/phased/HG004/HG004_30.bam" -"XXX/phased/HG002/HG002_60.bam" -"XXX/phased/HG003/HG003_60.bam" -"XXX/phased/HG004/HG004_60.bam" -"XXX/phased/HG002/HG002_80.bam" -"XXX/phased/HG003/HG003_80.bam" -"XXX/phased/HG004/HG004_80.bam" -"XXX/phased/HG002/HG002_30.bam" -"XXX/phased/HG003/HG003_10.bam" -"XXX/phased/HG004/HG004_10.bam" -"XXX/phased/HG002/HG002_60.bam" -"XXX/phased/HG003/HG003_10.bam" -"XXX/phased/HG004/HG004_10.bam" -"XXX/phased/HG002/HG002_80.bam" -"XXX/phased/HG003/HG003_10.bam" -"XXX/phased/HG004/HG004_10.bam" -"XXX/phased/HG002/HG002_60.bam" -"XXX/phased/HG003/HG003_30.bam" -"XXX/phased/HG004/HG004_30.bam" -"XXX/phased/HG002/HG002_80.bam" -"XXX/phased/HG003/HG003_30.bam" -"XXX/phased/HG004/HG004_30.bam" -"XXX/phased/HG002/HG002_80.bam" -"XXX/phased/HG003/HG003_60.bam" -"XXX/phased/HG004/HG004_60.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_10.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_10.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_10.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_30.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_30.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_30.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_60.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_60.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_60.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG002/HG002_80.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG003/HG003_80.bam" +"/autofs/bal36/jhsu/trio/r1041_hac/2_bam/HG004/HG004_80.bam" ) +# GH # set up array for create tensors input INPUT_PILEUP_VCF_C=() INPUT_PILEUP_VCF_P1=() @@ -417,6 +248,7 @@ time ${PARALLEL} --joblog ${LOG_PATH}/S_fiter_hete_snp_pileup${_LOG_SUF}.log -j$ --bed ${_TRIO_BED_PATH} \ --ctgName ${CHR_PREFIX}{1}" ::: ${CHR[@]} ::: ${INPUT_PILEUP_VCF_C[@]} :::+ ${INPUT_PILEUP_VCF_P1[@]} :::+ ${INPUT_PILEUP_VCF_P2[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} :::+ ${DEPTH_S[@]} |& tee ${LOG_PATH}/FHSP${_LOG_SUF}.log + echo "[INFO] Create Tensors" time ${PARALLEL} --joblog ${LOG_PATH}/S_create_tensor${_LOG_SUF}.log -j${THREADS} \ "${PYPY} ${CLAIR3_TRIO} CreateTensorFullAlignment \ @@ -445,20 +277,41 @@ time ${PARALLEL} --joblog ${LOG_PATH}/S_merge_tensors${_LOG_SUF}.log -j${THREADS " ::: ${CHR[@]} ::: ${CHUNK_LIST[@]} ::: ${DEPTH_S[@]} |& tee ${LOG_PATH}/MT${_LOG_SUF}.log +#IF_CHECK_MCV=1 # whether filter MCV in training data +# +#time ${PARALLEL} --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}.log -j${THREADS} \ +#"${PYTHON3} ${CLAIR3_TRIO} Tensor2Bin_Trio \ +#--tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1} \ +#--var_fn_c {4}/var_ru/var_{1} \ +#--var_fn_p1 {5}/var_ru/var_{1} \ +#--var_fn_p2 {6}/var_ru/var_{1} \ +#--bin_fn ${BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \ +#--chunk_id {2} \ +#--chunk_num ${bin_chunk_num} \ +#--platform ${PLATFORM} \ +#--allow_duplicate_chr_pos \ +#--maximum_non_variant_ratio 1.0 \ +#--check_mcv ${IF_CHECK_MCV} \ +#--shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee 
${LOG_PATH}/T2B${_LOG_SUF}.log + +THREADS=32 IF_CHECK_MCV=1 # whether filter MCV in training data +DENOVO_BINS_FOLDER_PATH="${DATASET_FOLDER_PATH}/denovo_t_bins" +mkdir -p ${DENOVO_BINS_FOLDER_PATH} -time ${PARALLEL} --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}.log -j${THREADS} \ -"${PYTHON3} ${CLAIR3_TRIO} Tensor2Bin_Trio \ +time ${PARALLEL} --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}_dn.log -j${THREADS} \ +"${PYTHON3} ${CLAIR3_TRIO} Tensor2Bin_Trio_denovo \ --tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1} \ --var_fn_c {4}/var_ru/var_{1} \ --var_fn_p1 {5}/var_ru/var_{1} \ --var_fn_p2 {6}/var_ru/var_{1} \ ---bin_fn ${BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \ +--bin_fn ${DENOVO_BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \ --chunk_id {2} \ --chunk_num ${bin_chunk_num} \ --platform ${PLATFORM} \ --allow_duplicate_chr_pos \ --maximum_non_variant_ratio 1.0 \ --check_mcv ${IF_CHECK_MCV} \ ---shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee ${LOG_PATH}/T2B${_LOG_SUF}.log +--only_denovo 0 \ +--shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee ${LOG_PATH}/T2B${_LOG_SUF}_dn.log diff --git a/docs/trio/4_create_tensors_denovo.sh b/docs/trio/4_create_tensors_denovo.sh new file mode 100644 index 0000000..8a638c9 --- /dev/null +++ b/docs/trio/4_create_tensors_denovo.sh @@ -0,0 +1,305 @@ +source /autofs/bal36/jhsu/trio/script/data_config_trio.sh + +PARALLEL=parallel +PYPY=pypy +SAMTOOLS=samtools +PYTHON3=python3 +PLATFORM="ont" +THREADS=8 # threads number + + +# Clair3-Trio's path +CLAIR3_TRIO="/autofs/bal36/jhsu/trio/Clair3-Nova/clair3.py" + +# creating working folder +TRAIN_FOLDER_PREFIX="/autofs/bal36/jhsu/trio/r1041_hac/4_build_tensors" +BUILD_N="ALL_1368_denovo" # data building data, e.g. "HG002_all" + +# Temporary working directories +TRAIN_FOLDER="${TRAIN_FOLDER_PREFIX}" +mkdir -p ${TRAIN_FOLDER} + +DATASET_FOLDER_PATH="${TRAIN_FOLDER}/build/${BUILD_N}" +TENSOR_CANDIDATE_FOLDER_PATH="${DATASET_FOLDER_PATH}/tensor_can" +BINS_FOLDER_PATH="${DATASET_FOLDER_PATH}/bins" +READ_FOLDER_PATH="${DATASET_FOLDER_PATH}/read_info" +INDEL_PATH="${DATASET_FOLDER_PATH}/alt_info" +SPLIT_BED_PATH="${DATASET_FOLDER_PATH}/split_beds" +PHASE_VCF_PATH="${DATASET_FOLDER_PATH}/phased_vcf" +PHASE_BAM_PATH="${DATASET_FOLDER_PATH}/phased_bam" +LOG_PATH="${DATASET_FOLDER_PATH}/log" +mkdir -p ${DATASET_FOLDER_PATH} +mkdir -p ${TENSOR_CANDIDATE_FOLDER_PATH} +mkdir -p ${BINS_FOLDER_PATH} +mkdir -p ${READ_FOLDER_PATH} +mkdir -p ${INDEL_PATH} +mkdir -p ${SPLIT_BED_PATH} +mkdir -p ${PHASE_VCF_PATH} +mkdir -p ${PHASE_BAM_PATH} +mkdir -p ${LOG_PATH} +cd ${DATASET_FOLDER_PATH} + + +# log file suffix name +_LOG_SUF="" # log file suffix + +# input files and parameters +# training chrosome name, and prefix +CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) +CHR_PREFIX="chr" + +# GH (tb update to github) +CHILD_SAMPLE_N="HG002" +P1_SAMPLE_N="HG003" +P2_SAMPLE_N="HG004" + +# sample name +ALL_SAMPLE=( +${P1_SAMPLE_N} # your parenet-1 sample name +${P2_SAMPLE_N} # your parenet-2 sample name +${CHILD_SAMPLE_N} # your child sample name +${P1_SAMPLE_N} +${P2_SAMPLE_N} +${CHILD_SAMPLE_N} +${P1_SAMPLE_N} +${P2_SAMPLE_N} +${CHILD_SAMPLE_N} +${P1_SAMPLE_N} +${P2_SAMPLE_N} +${CHILD_SAMPLE_N} +) + +TRIO_N="${P1_SAMPLE_N}_TRIO" # your trio name, e.g. 
HG002_TRIO +#echo ${ALL_SAMPLE[@]} + +DEPTHS=( # data coverage +10 +10 +10 +30 +30 +30 +60 +60 +60 +80 +80 +80 +) + +#HOME_DIR="/autofs/bal31/jhsu/home" +#REF_FILE_PATH="${HOME_DIR}/data/reference/grch38_no_alt_plus_hs38d1/GCA_000001405.15_GRCh38_no_alt_plus_hs38d1_analysis_set.fna" +#CHILD_BED_FILE_PATH="${HOME_DIR}/data/giab/${CHILD_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +#P1_BED_FILE_PATH="${HOME_DIR}/data/giab/${P1_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +#P2_BED_FILE_PATH="${HOME_DIR}/data/giab/${P2_SAMPLE_N}_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +# +## GH +## merge trio's bed file using the gerneate_trio_bed.sh +#_TRIO_BED_PATH="${HOME_DIR}/data/giab/020304.bed" + +ALL_REFERENCE_FILE_PATH=( +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +"${REF_FILE_PATH}" +) + + +ALL_BED_FILE_PATH=( +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +${_TRIO_BED_PATH} +) + + +# true variants set from Clair3-Trio Representation Unification +# Each line represents one representation-unified path for each input sample +# note the all path have a folder called **var_ru** +# check the representation_unification_trio.md page for more information +# for practical concerns, the representation_unification_trio.md require only run once on the highest depth for each sample, while the low coverage can be sampled from the highest coverage data, i.e. merged.bam in the representation_unification folder + +ORI_PILEUP_D="/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/" +ALL_PILEUP_VCF_FILE_PATH=( +"${ORI_PILEUP_D}/HG003_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_30/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_30/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_30/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_60/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_60/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_60/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_80/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_80/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_80/pileup.vcf.gz" +) + +ORI_RU_D="/autofs/bal36/jhsu/fast5/r1041/hac/1_ru" +ALL_RU_FILE_PATH=( +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +) + +ORI_BAM_D="/autofs/bal36/jhsu/trio/r1041_hac/2_bam" +ALL_PHASED_BAM_FILE_PATH=( +"${ORI_BAM_D}/HG003/HG003_10.bam" +"${ORI_BAM_D}/HG004/HG004_10.bam" +"${ORI_BAM_D}/HG002/HG002_10.bam" +"${ORI_BAM_D}/HG003/HG003_30.bam" +"${ORI_BAM_D}/HG004/HG004_30.bam" +"${ORI_BAM_D}/HG002/HG002_30.bam" +"${ORI_BAM_D}/HG003/HG003_60.bam" +"${ORI_BAM_D}/HG004/HG004_60.bam" +"${ORI_BAM_D}/HG002/HG002_60.bam" +"${ORI_BAM_D}/HG003/HG003_80.bam" +"${ORI_BAM_D}/HG004/HG004_80.bam" +"${ORI_BAM_D}/HG002/HG002_80.bam" +) + + +# GH +# set up array for create tensors input +INPUT_PILEUP_VCF_C=() +INPUT_PILEUP_VCF_P1=() +INPUT_PILEUP_VCF_P2=() + +TRUE_RU_FILE_C=() +TRUE_RU_FILE_P1=() +TRUE_RU_FILE_P2=() + +DEPTH_S=() + +# GH +# create trio list for candidates input +for i in $(seq 0 $((${#ALL_SAMPLE[@]}-1))) +do + + if [ $(($i % 3)) -eq 0 ]; then 
+ INPUT_PILEUP_VCF_C+=("${ALL_PILEUP_VCF_FILE_PATH[$(($i))]}") + INPUT_PILEUP_VCF_P1+=("${ALL_PILEUP_VCF_FILE_PATH[$(($i+1))]}") + INPUT_PILEUP_VCF_P2+=("${ALL_PILEUP_VCF_FILE_PATH[$(($i+2))]}") + + TRUE_RU_FILE_C+=("${ALL_RU_FILE_PATH[$(($i))]}") + TRUE_RU_FILE_P1+=("${ALL_RU_FILE_PATH[$(($i+1))]}") + TRUE_RU_FILE_P2+=("${ALL_RU_FILE_PATH[$(($i+2))]}") + + DEPTH_S+=("${DEPTHS[$(($i))]}") + fi +done + +echo ${INPUT_PILEUP_VCF_C[@]} +echo ${INPUT_PILEUP_VCF_P1[@]} +echo ${INPUT_PILEUP_VCF_P2[@]} +echo ${TRUE_RU_FILE_C[@]} +echo ${TRUE_RU_FILE_P1[@]} +echo ${TRUE_RU_FILE_P2[@]} +echo ${DEPTH_S[@]} + +# created tensors chunk number for each chr +chunk_num=20 +CHUNK_LIST=`seq 1 ${chunk_num}` + +# create tensors bin chunk number for each chr +bin_chunk_num=10 +BIN_CHUNK_LIST=`seq 1 ${bin_chunk_num}` + + +# Select trio candidates from pileup candidates using the SelectHetSnp_Trio submodule +time ${PARALLEL} --joblog ${LOG_PATH}/S_fiter_hete_snp_pileup${_LOG_SUF}.log -j${THREADS} \ +"${PYPY} ${CLAIR3_TRIO} SelectHetSnp_Trio \ +--alt_fn_c {2} \ +--alt_fn_p1 {3} \ +--alt_fn_p2 {4} \ +--var_fn_c {5}/var_ru/var_{1} \ +--var_fn_p1 {6}/var_ru/var_{1} \ +--var_fn_p2 {7}/var_ru/var_{1} \ +--split_folder ${SPLIT_BED_PATH} \ +--sampleName ${TRIO_N} \ +--depth {8} \ +--ref_pct_full 0.2 \ +--var_pct_full 1.0 \ +--ref_var_max_ratio 1.0 \ +--for_train 1 \ +--chunk_num ${chunk_num} \ +--bed ${_TRIO_BED_PATH} \ +--ctgName ${CHR_PREFIX}{1}" ::: ${CHR[@]} ::: ${INPUT_PILEUP_VCF_C[@]} :::+ ${INPUT_PILEUP_VCF_P1[@]} :::+ ${INPUT_PILEUP_VCF_P2[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} :::+ ${DEPTH_S[@]} |& tee ${LOG_PATH}/FHSP${_LOG_SUF}.log + + +echo "[INFO] Create Tensors" +time ${PARALLEL} --joblog ${LOG_PATH}/S_create_tensor${_LOG_SUF}.log -j${THREADS} \ +"${PYPY} ${CLAIR3_TRIO} CreateTensorFullAlignment \ +--bam_fn {4} \ +--ref_fn {5} \ +--tensor_can_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_{2}_{3}_{1}_{6} \ +--indel_fn ${INDEL_PATH}/{2}_{3}_{1}_{6} \ +--ctgName ${CHR_PREFIX}{1} \ +--samtools ${SAMTOOLS} \ +--platform ${PLATFORM} \ +--full_aln_regions ${SPLIT_BED_PATH}/${TRIO_N}_{3}_{1}_{6} \ +--add_no_phasing_data_training \ +--phasing_info_in_bam" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]} :::+ ${ALL_PHASED_BAM_FILE_PATH[@]} :::+ ${ALL_REFERENCE_FILE_PATH[@]} ::: ${CHUNK_LIST[@]} |& tee ${LOG_PATH}/CT${_LOG_SUF}.log + +# merge trio tensor, noted that merged no depth info +time ${PARALLEL} --joblog ${LOG_PATH}/S_merge_tensors${_LOG_SUF}.log -j${THREADS} \ +"${PYTHON3} ${CLAIR3_TRIO} Merge_Tensors_Trio \ +--tensor_fn_c ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${ALL_SAMPLE[0]}_{3}_{1}_{2} \ +--tensor_fn_p1 ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${ALL_SAMPLE[1]}_{3}_{1}_{2} \ +--tensor_fn_p2 ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${ALL_SAMPLE[2]}_{3}_{1}_{2} \ +--candidate_fn_c ${INDEL_PATH}/${ALL_SAMPLE[0]}_{3}_{1}_{2} \ +--candidate_fn_p1 ${INDEL_PATH}/${ALL_SAMPLE[1]}_{3}_{1}_{2} \ +--candidate_fn_p2 ${INDEL_PATH}/${ALL_SAMPLE[2]}_{3}_{1}_{2} \ +--tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1}_{2} \ +--candidate_fn ${INDEL_PATH}/${TRIO_N}_{3}_{1}_{2} \ +" ::: ${CHR[@]} ::: ${CHUNK_LIST[@]} ::: ${DEPTH_S[@]} |& tee ${LOG_PATH}/MT${_LOG_SUF}.log + +IF_CHECK_MCV=1 # whether filter MCV in training data +#CHR=(1) +#CLAIR3_TRIO=/autofs/bal36/jhsu/trio/Clair3-Denovo/clair3.py +# after filter +DENOVO_BINS_FOLDER_PATH="${DATASET_FOLDER_PATH}/denovo_t_bins" +#DENOVO_BINS_FOLDER_PATH="${DATASET_FOLDER_PATH}/denovo_2t_bins" +mkdir -p 
${DENOVO_BINS_FOLDER_PATH} + +time ${PARALLEL} --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}.log -j${THREADS} \ +"${PYTHON3} ${CLAIR3_TRIO} Tensor2Bin_Trio_denovo \ +--tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1} \ +--var_fn_c {4}/var_ru/var_{1} \ +--var_fn_p1 {5}/var_ru/var_{1} \ +--var_fn_p2 {6}/var_ru/var_{1} \ +--bin_fn ${DENOVO_BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \ +--chunk_id {2} \ +--chunk_num ${bin_chunk_num} \ +--platform ${PLATFORM} \ +--allow_duplicate_chr_pos \ +--maximum_non_variant_ratio 1.0 \ +--check_mcv ${IF_CHECK_MCV} \ +--only_denovo 1 \ +--shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee ${LOG_PATH}/T2B${_LOG_SUF}.log + diff --git a/docs/trio/4_create_tensors_sub.sh b/docs/trio/4_create_tensors_sub.sh index 28c4dab..e24b60d 100644 --- a/docs/trio/4_create_tensors_sub.sh +++ b/docs/trio/4_create_tensors_sub.sh @@ -1,4 +1,4 @@ -source /autofs/bal36/jhsu/r10/scripts/data_config_trio.sh +source /autofs/bal36/jhsu/trio/script/data_config_trio.sh PARALLEL=parallel PYPY=pypy @@ -9,11 +9,11 @@ THREADS=8 # threads number # Clair3-Trio's path -CLAIR3_TRIO="/autofs/bal31/jhsu/home/projects/git/clair3-trio-clean-v1/clair3.py" +CLAIR3_TRIO="/autofs/bal36/jhsu/trio/Clair3-Nova/clair3.py" # creating working folder -TRAIN_FOLDER_PREFIX="/autofs/bal36/jhsu/r10/output/4_build_tensors" -BUILD_N="SUB_1356" # data building data, e.g. "HG002_all" +TRAIN_FOLDER_PREFIX="/autofs/bal36/jhsu/trio/r1041_hac/4_build_tensors" +BUILD_N="SUB_1368" # data building data, e.g. "HG002_all" # Temporary working directories TRAIN_FOLDER="${TRAIN_FOLDER_PREFIX}" @@ -45,7 +45,8 @@ _LOG_SUF="" # log file suffix # input files and parameters # training chrosome name, and prefix -CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) +#CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22) +CHR=(1 4 7 10 13 16 19 21 22) CHR_PREFIX="chr" # GH (tb update to github) @@ -70,9 +71,6 @@ ${P2_SAMPLE_N} ${CHILD_SAMPLE_N} ${P1_SAMPLE_N} ${P2_SAMPLE_N} -${CHILD_SAMPLE_N} -${P1_SAMPLE_N} -${P2_SAMPLE_N} ) TRIO_N="${CHILD_SAMPLE_N}_TRIO_SUB" # your trio name, e.g. HG002_TRIO @@ -82,21 +80,18 @@ DEPTHS=( # data coverage 31 31 31 -51 -51 -51 -53 -53 -53 61 61 61 -63 -63 -63 -6550 -6550 -6550 +81 +81 +81 +13 +13 +13 +18 +18 +18 ) #HOME_DIR="/autofs/bal31/jhsu/home" @@ -125,9 +120,6 @@ ALL_REFERENCE_FILE_PATH=( "${REF_FILE_PATH}" "${REF_FILE_PATH}" "${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" -"${REF_FILE_PATH}" ) @@ -147,9 +139,6 @@ ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} ${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} -${_TRIO_BED_PATH} ) @@ -159,67 +148,60 @@ ${_TRIO_BED_PATH} # check the representation_unification_trio.md page for more information # for practical concerns, the representation_unification_trio.md require only run once on the highest depth for each sample, while the low coverage can be sampled from the highest coverage data, i.e. 
merged.bam in the representation_unification folder +ORI_PILEUP_D="/autofs/bal36/jhsu/trio/r1041_hac/3_pileup/" ALL_PILEUP_VCF_FILE_PATH=( -"/autofs/bal36/jhsu/r10/output/3_pileup/HG002_30/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG003_10/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG004_10/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG002_50/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG003_10/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG004_10/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG002_50/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG003_30/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG004_30/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG002_65/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG003_10/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG004_10/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG002_65/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG003_30/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG004_30/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG002_65/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG003_50/pileup.vcf.gz" -"/autofs/bal36/jhsu/r10/output/3_pileup/HG004_50/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_30/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_60/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_80/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_30/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_30/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG002_10/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG003_80/pileup.vcf.gz" +"${ORI_PILEUP_D}/HG004_80/pileup.vcf.gz" ) +ORI_RU_D="/autofs/bal36/jhsu/fast5/r1041/hac/1_ru" ALL_RU_FILE_PATH=( -"/autofs/bal36/jhsu/r10/output/1_ru/HG002/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG003/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG004/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG002/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG003/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG004/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG002/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG003/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG004/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG002/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG003/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG004/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG002/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG003/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG004/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG002/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG003/" -"/autofs/bal36/jhsu/r10/output/1_ru/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" +"${ORI_RU_D}/HG004/" +"${ORI_RU_D}/HG002/" +"${ORI_RU_D}/HG003/" ) +ORI_BAM_D="/autofs/bal36/jhsu/trio/r1041_hac/2_bam" ALL_PHASED_BAM_FILE_PATH=( -"/autofs/bal36/jhsu/r10/output/2_bam/HG002/HG002_30.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG003/HG003_10.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG004/HG004_10.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG002/HG002_50.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG003/HG003_10.bam" 
-"/autofs/bal36/jhsu/r10/output/2_bam/HG004/HG004_10.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG002/HG002_50.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG003/HG003_30.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG004/HG004_30.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG002/HG002_65.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG003/HG003_10.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG004/HG004_10.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG002/HG002_65.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG003/HG003_30.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG004/HG004_30.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG002/HG002_65.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG003/HG003_50.bam" -"/autofs/bal36/jhsu/r10/output/2_bam/HG004/HG004_50.bam" +"${ORI_BAM_D}/HG002/HG002_30.bam" +"${ORI_BAM_D}/HG003/HG003_10.bam" +"${ORI_BAM_D}/HG004/HG004_10.bam" +"${ORI_BAM_D}/HG002/HG002_60.bam" +"${ORI_BAM_D}/HG003/HG003_10.bam" +"${ORI_BAM_D}/HG004/HG004_10.bam" +"${ORI_BAM_D}/HG002/HG002_80.bam" +"${ORI_BAM_D}/HG003/HG003_10.bam" +"${ORI_BAM_D}/HG004/HG004_10.bam" +"${ORI_BAM_D}/HG002/HG002_10.bam" +"${ORI_BAM_D}/HG003/HG003_30.bam" +"${ORI_BAM_D}/HG004/HG004_30.bam" +"${ORI_BAM_D}/HG002/HG002_10.bam" +"${ORI_BAM_D}/HG003/HG003_80.bam" +"${ORI_BAM_D}/HG004/HG004_80.bam" ) @@ -290,6 +272,7 @@ time ${PARALLEL} --joblog ${LOG_PATH}/S_fiter_hete_snp_pileup${_LOG_SUF}.log -j$ --bed ${_TRIO_BED_PATH} \ --ctgName ${CHR_PREFIX}{1}" ::: ${CHR[@]} ::: ${INPUT_PILEUP_VCF_C[@]} :::+ ${INPUT_PILEUP_VCF_P1[@]} :::+ ${INPUT_PILEUP_VCF_P2[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} :::+ ${DEPTH_S[@]} |& tee ${LOG_PATH}/FHSP${_LOG_SUF}.log + echo "[INFO] Create Tensors" time ${PARALLEL} --joblog ${LOG_PATH}/S_create_tensor${_LOG_SUF}.log -j${THREADS} \ "${PYPY} ${CLAIR3_TRIO} CreateTensorFullAlignment \ @@ -318,20 +301,40 @@ time ${PARALLEL} --joblog ${LOG_PATH}/S_merge_tensors${_LOG_SUF}.log -j${THREADS " ::: ${CHR[@]} ::: ${CHUNK_LIST[@]} ::: ${DEPTH_S[@]} |& tee ${LOG_PATH}/MT${_LOG_SUF}.log +#IF_CHECK_MCV=1 # whether filter MCV in training data +# +#time ${PARALLEL} --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}.log -j${THREADS} \ +#"${PYTHON3} ${CLAIR3_TRIO} Tensor2Bin_Trio \ +#--tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1} \ +#--var_fn_c {4}/var_ru/var_{1} \ +#--var_fn_p1 {5}/var_ru/var_{1} \ +#--var_fn_p2 {6}/var_ru/var_{1} \ +#--bin_fn ${BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \ +#--chunk_id {2} \ +#--chunk_num ${bin_chunk_num} \ +#--platform ${PLATFORM} \ +#--allow_duplicate_chr_pos \ +#--maximum_non_variant_ratio 1.0 \ +#--check_mcv ${IF_CHECK_MCV} \ +#--shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee ${LOG_PATH}/T2B${_LOG_SUF}.log + +THREADS=16 IF_CHECK_MCV=1 # whether filter MCV in training data +DENOVO_BINS_FOLDER_PATH="${DATASET_FOLDER_PATH}/denovo_t_bins" +mkdir -p ${DENOVO_BINS_FOLDER_PATH} -time ${PARALLEL} --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}.log -j${THREADS} \ -"${PYTHON3} ${CLAIR3_TRIO} Tensor2Bin_Trio \ +time ${PARALLEL} --resume-failed --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}_dn.log -j${THREADS} \ +"${PYTHON3} ${CLAIR3_TRIO} Tensor2Bin_Trio_denovo \ --tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1} \ --var_fn_c {4}/var_ru/var_{1} \ --var_fn_p1 {5}/var_ru/var_{1} \ --var_fn_p2 {6}/var_ru/var_{1} \ ---bin_fn ${BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \ +--bin_fn 
${DENOVO_BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \ --chunk_id {2} \ --chunk_num ${bin_chunk_num} \ --platform ${PLATFORM} \ --allow_duplicate_chr_pos \ --maximum_non_variant_ratio 1.0 \ --check_mcv ${IF_CHECK_MCV} \ ---shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee ${LOG_PATH}/T2B${_LOG_SUF}.log - +--only_denovo 0 \ +--shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee ${LOG_PATH}/T2B${_LOG_SUF}_dn.log diff --git a/docs/trio/5_train_all_dn.sh b/docs/trio/5_train_all_dn.sh new file mode 100644 index 0000000..8f7de06 --- /dev/null +++ b/docs/trio/5_train_all_dn.sh @@ -0,0 +1,65 @@ +# Training trio model +PYTHON3=/autofs/bal31/jhsu/home/_env/anaconda3/envs/py38_3090/bin/python +PLATFORM="ont" +# Clair3-Nova's path +CLAIR3_TRIO="/autofs/bal36/jhsu/trio/Clair3-Nova/clair3.py" + +# bins folder for training +ALL_BINS_FOLDER_PATH="/autofs/bal36/jhsu/trio/r1041_hac/4_build_tensors/build/merged_bins_dn/" +TRAIN_FOLDER_PREFIX="/autofs/bal36/jhsu/trio/r1041_hac/5_train/" + +# train on all data init model, 3090 +TRAIN_N="ALL_SUB_1236_NM" +TRAIN_N="ALL_SUB_1368_NM_DN" +MODEL_FOLDER_PATH="${TRAIN_FOLDER_PREFIX}/${TRAIN_N}/" +mkdir -p ${MODEL_FOLDER_PATH} +cd ${MODEL_FOLDER_PATH} + +# training setting +BATCH_SIZE="1600" #training batch size, e.g. 800 +add_indel_length=1 +MODEL_ARC=NN +IF_ADD_MCV_LOSS=0 +MCVLOSS_ALPHA=0 + +# A single GPU is used for model training +export CUDA_VISIBLE_DEVICES="0" + + +MODEL_ALS="Clair3_Trio_Out3_denovo" +time ${PYTHON3} ${CLAIR3_TRIO} Train_Trio \ +--bin_fn ${ALL_BINS_FOLDER_PATH} \ +--ochk_prefix ${MODEL_FOLDER_PATH} \ +--add_indel_length ${add_indel_length} \ +--platform ${PLATFORM} \ +--validation_dataset \ +--learning_rate 0.001 \ +--maxEpoch 30 \ +--model_arc ${MODEL_ARC} \ +--model_cls ${MODEL_ALS} \ +--batch_size ${BATCH_SIZE} \ +--add_mcv_loss ${IF_ADD_MCV_LOSS} \ +--mcv_alpha ${MCVLOSS_ALPHA} \ +--train_denovo 1\ + |& tee ${MODEL_FOLDER_PATH}/train_log +# +##IF_ADD_MCV_LOSS=1 +##MCVLOSS_ALPHA=1 +# +#PRETRAINED_MODEL=/autofs/bal36/jhsu/guppy5/5_train/train/ALL_no_mcv.15 +##PRETRAINED_MODEL=/autofs/bal36/jhsu/guppy5/5_train/train/ALL_data_m2/.15 +#echo "[INFO] Model finetune" +#time ${PYTHON3} ${CLAIR3_TRIO} Train_Trio \ +# --bin_fn ${ALL_BINS_FOLDER_PATH} \ +# --ochk_prefix ${MODEL_FOLDER_PATH} \ +# --add_indel_length ${add_indel_length} \ +# --platform ${PLATFORM} \ +# --validation_dataset \ +# --learning_rate 1e-5 \ +# --maxEpoch 10 \ +# --model_arc ${MODEL_ARC} \ +# --batch_size ${BATCH_SIZE} \ +# --chkpnt_fn ${PRETRAINED_MODEL} \ +# --add_mcv_loss ${IF_ADD_MCV_LOSS} \ +# --mcv_alpha ${MCVLOSS_ALPHA} \ +# |& tee ${MODEL_FOLDER_PATH}/train_log diff --git a/docs/trio/nova_training.md b/docs/trio/nova_training.md new file mode 100644 index 0000000..bfd8686 --- /dev/null +++ b/docs/trio/nova_training.md @@ -0,0 +1,433 @@ +# Train a model for Clair3-nova calling + +--- + +June 23, 2024 + +Clair3-Nova requires a pileup model, the training procedure of which is identical to Clair3, and a nova model, the training procedure of which is shown below. The pileup model can be obtained from [pre-trained](https://github.com/HKU-BAL/Clair3#pre-trained-models) and the training guide is at [here](../pileup_training.md). The nova model's training procedure is shown below, and the example provided uses the GIAB Ashkenazim Trio (HG002/3/4) as input. 
To get started, it's recommended that you modify the helper scripts below to train a nova model.
+
+- [0_generate_trio_bed.sh](0_generate_trio_bed.sh)
+- [1 representation_unification_trio.md](representation_unification_trio.md)
+- [2_generate_downsample_phased_bam](2_gen_downsample_phased_bam_p.sh)
+- [3_generate_downsample_pileup.sh](3_generate_downsample_pileup.sh)
+- [4.1_create tensors for equal depths](4_create_tensors.sh)
+- [4.2_create tensors for diverse depths](4_create_tensors_sub.sh)
+- [4.3_create tensors for de novo cases](4_create_tensors_denovo.sh)
+- [4.4_downsample_tensors.sh](4_1_downsample_bin.sh)
+- [5_train.sh](5_train_all_dn.sh)
+
+To train a trio model, it's recommended to use data with a minimum coverage above 60x, as the downsampling coverage settings in this training procedure are tailored for data above 60x.
+
+If your data has a minimum coverage of about 65x, you can find a recommended downsampling coverage setting in this section: [ONT Q20+](#q20-data-configuration).
+
+---
+
+## Prerequisites
+
+- Clair3 installed
+- Clair3-Nova installed
+- GNU Parallel installed
+- Sufficient hard-disk space
+- Truth VCF files with representation unification applied (check [here](representation_unification_trio.md) for steps)
+- A high-end GPU
+
+---
+
+## Contents
+
+  - [1. Set variables](#1-set-variables)
+  - [2. Run Clair3 pileup model](#2-run-clair3-pileup-model)
+  - [3-1 Create and merge trio tensors](#3-1-create-and-merge-trio-tensors)
+  - [3-2 Downsample all bins](#3-2-downsample-all-bins)
+  - [4. Train a Clair3-Nova model](#4-train-a-clair3-nova-model)
+
+---
+
+## 0. Prepare required files
+
+The input files for training a Clair3-Nova model include:
+
+- Reference file: [R]
+- child/parents high-confidence BED files for truth variants: [C TRUTH BED], [P1 TRUTH BED], [P2 TRUTH BED]
+- BAM and VCF files generated after finishing the steps in [representation_unification_trio](https://github.com/HKU-BAL/Clair3-Trio/blob/trio/docs/trio/representation_unification_trio.md)
+    - phased alignment BAM files of child and parents: [C BAM], [P1 BAM], [P2 BAM]
+    - child/parents truth VCF files with representation unification applied: [C TRUTH VCF], [P1 TRUTH VCF], [P2 TRUTH VCF]
+
+
+## 1. Set variables
+
+Model training requires phased alignments. Please follow this [page](representation_unification_trio.md) to phase the alignments of all your family samples.
+
+Note that all samples in the family should be phased before training to ensure the accuracy of the trio model.
+
+```bash
+PARALLEL="[PARALLEL_PATH]"         # e.g. "parallel"
+PYPY="[PYPY_PATH]"                 # e.g. "pypy3"
+SAMTOOLS="[SAMTOOLS_PATH]"         # e.g. "samtools"
+PYTHON3="[PYTHON3_PATH]"           # e.g. "python3"
+THREADS=8                          # threads number
+PLATFORM="ont"
+
+# Clair3 folder
+_ORI_CLAIR3="[CLAIR3_PATH]"
+_MODEL_DIR="[CLAIR3_MODEL_PATH]"
+C3_THREADS=8                       # Clair3 threads number
+
+# Clair3-Nova's path
+CLAIR3_NOVA="[CLAIR3-NOVA_PATH]/clair3.py"
+
+# Creating working folder
+TRAIN_FOLDER_PREFIX="[YOUR_TRAINING_FOLDER]"
+BUILD_N="[DATA_BUILDING_NAME]"     # data build name, e.g. "HG002_all"
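+
+# (optional sanity check, an added sketch rather than part of the original
+# guide) verify that the configured tool paths resolve before running the
+# steps below
+for t in "${PARALLEL}" "${PYPY}" "${SAMTOOLS}" "${PYTHON3}"; do
+    command -v "${t}" > /dev/null || echo "[WARN] ${t} not found"
+done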
"HG002_all" + +# Temporary working directories +TRAIN_FOLDER="${TRAIN_FOLDER_PREFIX}" +mkdir -p ${TRAIN_FOLDER} + +DATASET_FOLDER_PATH="${TRAIN_FOLDER}/build/${BUILD_N}" +TENSOR_CANDIDATE_FOLDER_PATH="${DATASET_FOLDER_PATH}/tensor_can" +BINS_FOLDER_PATH="${DATASET_FOLDER_PATH}/bins" +READ_FOLDER_PATH="${DATASET_FOLDER_PATH}/read_info" +INDEL_PATH="${DATASET_FOLDER_PATH}/alt_info" +SPLIT_BED_PATH="${DATASET_FOLDER_PATH}/split_beds" +PHASE_VCF_PATH="${DATASET_FOLDER_PATH}/phased_vcf" +PHASE_BAM_PATH="${DATASET_FOLDER_PATH}/phased_bam" +LOG_PATH="${DATASET_FOLDER_PATH}/log" +PILEUP_OUTPUT_PATH="${DATASET_FOLDER_PATH}/pileup" +mkdir -p ${DATASET_FOLDER_PATH} +mkdir -p ${TENSOR_CANDIDATE_FOLDER_PATH} +mkdir -p ${BINS_FOLDER_PATH} +mkdir -p ${READ_FOLDER_PATH} +mkdir -p ${INDEL_PATH} +mkdir -p ${SPLIT_BED_PATH} +mkdir -p ${PHASE_VCF_PATH} +mkdir -p ${PHASE_BAM_PATH} +mkdir -p ${LOG_PATH} +mkdir -p ${PILEUP_OUTPUT_PATH} +cd ${DATASET_FOLDER_PATH} + +# log file suffix name +_LOG_SUF="" # log file suffix + +# input files and parameters + +# sample name +ALL_SAMPLE=( +${CHILD_SAMPLE_N} # your child sample name +${P1_SAMPLE_N} # your parenet-1 sample name +${P2_SAMPLE_N} # your parenet-2 sample name +${CHILD_SAMPLE_N} # your child sample name +${P1_SAMPLE_N} # your parenet-1 sample name +${P2_SAMPLE_N} # your parenet-2 sample name +) + +TRIO_N="${CHILD_SAMPLE_N}_TRIO" # your trio name, e.g. HG002_TRIO + +# Note: Here we setting the data coverage as 10x and 30x for example, all following paths of (${ALL_RU_FILE_PATH}, ${ALL_PHASED_BAM_FILE_PATH}, ${ALL_REFERENCE_FILE_PATH}, etc.), need to be set to ${DEPTHS} accordingly. check the 4_create_tensors.sh for more example + +DEPTHS=( # data coverage +10 +10 +10 +30 +10 +10 +) + +# True variants set from Clair3-Trio Representation Unification +# Each line represents one representation-unified path for each input sample +# Note the all path have a folder called **var_ru** +# Check the representation_unification_trio.md page for more information +# For practical concerns, the representation_unification_trio.md require only run once on the highest depth for each sample, while the low coverage can be sampled from the highest coverage data, i.e. 
+
+ALL_RU_FILE_PATH=(
+"[child representation unified folder]"
+"[parent-1 representation unified folder]"
+"[parent-2 representation unified folder]"
+"[child representation unified folder]"
+"[parent-1 representation unified folder]"
+"[parent-2 representation unified folder]"
+)
+
+# downsampled RU BAMs
+ALL_PHASED_BAM_FILE_PATH=(
+"[child representation unified folder]/${CHILD_SAMPLE_N}_10.bam"
+"[parent-1 representation unified folder]/${P1_SAMPLE_N}_10.bam"
+"[parent-2 representation unified folder]/${P2_SAMPLE_N}_10.bam"
+"[child representation unified folder]/${CHILD_SAMPLE_N}_30.bam"
+"[parent-1 representation unified folder]/${P1_SAMPLE_N}_10.bam"
+"[parent-2 representation unified folder]/${P2_SAMPLE_N}_10.bam"
+)
+
+ALL_REFERENCE_FILE_PATH=(
+"[YOUR_REF_FILE]"
+"[YOUR_REF_FILE]"
+"[YOUR_REF_FILE]"
+"[YOUR_REF_FILE]"
+"[YOUR_REF_FILE]"
+"[YOUR_REF_FILE]"
+)
+
+ALL_ORI_BED_FILE_PATH=(
+"[YOUR_BED_FILE_CHILD]"
+"[YOUR_BED_FILE_PARENT1]"
+"[YOUR_BED_FILE_PARENT2]"
+"[YOUR_BED_FILE_CHILD]"
+"[YOUR_BED_FILE_PARENT1]"
+"[YOUR_BED_FILE_PARENT2]"
+)
+
+# training chromosome names and prefix
+CHR=(1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 21 22)
+CHR_PREFIX="chr"
+
+# merge the trio's BED files
+BED2=${ALL_ORI_BED_FILE_PATH[0]}
+BED3=${ALL_ORI_BED_FILE_PATH[1]}
+BED4=${ALL_ORI_BED_FILE_PATH[2]}
+_TRIO_BED_PATH=${DATASET_FOLDER_PATH}/020304.bed
+_INPUT_DIR=${DATASET_FOLDER_PATH}  # working folder mounted into the bedtools container;
+                                   # the BED files must also be visible inside the
+                                   # container, so mount their folder(s) as needed
+
+docker run -v "${_INPUT_DIR}":"${_INPUT_DIR}" biocontainers/bedtools:v2.26.0dfsg-3-deb_cv1 bedtools intersect -a ${BED2} -b ${BED3} > ${_INPUT_DIR}/tmp_out
+docker run -v "${_INPUT_DIR}":"${_INPUT_DIR}" biocontainers/bedtools:v2.26.0dfsg-3-deb_cv1 bedtools sort -i ${_INPUT_DIR}/tmp_out > ${_INPUT_DIR}/0203.bed
+docker run -v "${_INPUT_DIR}":"${_INPUT_DIR}" biocontainers/bedtools:v2.26.0dfsg-3-deb_cv1 bedtools intersect -a ${_INPUT_DIR}/0203.bed -b ${BED4} > ${_INPUT_DIR}/tmp_out
+docker run -v "${_INPUT_DIR}":"${_INPUT_DIR}" biocontainers/bedtools:v2.26.0dfsg-3-deb_cv1 bedtools sort -i ${_INPUT_DIR}/tmp_out > ${_TRIO_BED_PATH}
+
+ALL_BED_FILE_PATH=(
+${_TRIO_BED_PATH}
+${_TRIO_BED_PATH}
+${_TRIO_BED_PATH}
+${_TRIO_BED_PATH}
+${_TRIO_BED_PATH}
+${_TRIO_BED_PATH}
+)
+
+```
+---
+
+## 2. Run Clair3 pileup model
+
+```
+# Run the Clair3 pileup model on every sample/depth combination
+time ${PARALLEL} -j ${C3_THREADS} --joblog ${LOG_PATH}/input_pileup${_LOG_SUF}.log ${_ORI_CLAIR3}/run_clair3.sh \
+  --bam_fn={5} \
+  --ref_fn={2} \
+  --threads=${C3_THREADS} \
+  --platform="ont" \
+  --model_path="${_MODEL_DIR}" \
+  --output=${PILEUP_OUTPUT_PATH}/{1}_{4} \
+  --bed_fn={3} \
+  --pileup_only ::: ${ALL_SAMPLE[@]} :::+ ${ALL_REFERENCE_FILE_PATH[@]} :::+ ${ALL_BED_FILE_PATH[@]} :::+ ${DEPTHS[@]} :::+ ${ALL_PHASED_BAM_FILE_PATH[@]}
+
+```
+
+## 3-1 Create and merge trio tensors
+
+Generate even and uneven coverage combinations as the input for Clair3-Nova model training; here we use (child 10x, parent1 10x, parent2 10x) + (child 30x, parent1 10x, parent2 10x) as an example (see the sketch below).
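+
+As a quick illustration (an added sketch, not part of the original scripts), the trio combinations are encoded as consecutive triplets of `${ALL_SAMPLE}` and `${DEPTHS}` from step 1; you can preview them before building tensors:
+
+```bash
+# print one line per trio combination (child, parent1, parent2 with depths)
+for i in $(seq 0 3 $((${#ALL_SAMPLE[@]}-1))); do
+    echo "trio: ${ALL_SAMPLE[$i]} ${DEPTHS[$i]}x + ${ALL_SAMPLE[$((i+1))]} ${DEPTHS[$((i+1))]}x + ${ALL_SAMPLE[$((i+2))]} ${DEPTHS[$((i+2))]}x"
+done
+```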
+
+```
+
+# Get the output pileup VCF paths (one per sample/depth combination above)
+ALL_PILEUP_VCF_FILE_PATH=(
+"${PILEUP_OUTPUT_PATH}/${ALL_SAMPLE[0]}_${DEPTHS[0]}/pileup.vcf.gz"
+"${PILEUP_OUTPUT_PATH}/${ALL_SAMPLE[1]}_${DEPTHS[1]}/pileup.vcf.gz"
+"${PILEUP_OUTPUT_PATH}/${ALL_SAMPLE[2]}_${DEPTHS[2]}/pileup.vcf.gz"
+"${PILEUP_OUTPUT_PATH}/${ALL_SAMPLE[3]}_${DEPTHS[3]}/pileup.vcf.gz"
+"${PILEUP_OUTPUT_PATH}/${ALL_SAMPLE[4]}_${DEPTHS[4]}/pileup.vcf.gz"
+"${PILEUP_OUTPUT_PATH}/${ALL_SAMPLE[5]}_${DEPTHS[5]}/pileup.vcf.gz"
+)
+
+# set up input for trio
+INPUT_PILEUP_VCF_C=()
+INPUT_PILEUP_VCF_P1=()
+INPUT_PILEUP_VCF_P2=()
+
+TRUE_RU_FILE_C=()
+TRUE_RU_FILE_P1=()
+TRUE_RU_FILE_P2=()
+
+DEPTH_S=()
+
+# create trio list for candidates input
+for i in $(seq 0 $((${#ALL_SAMPLE[@]}-1)))
+do
+    if [ $(($i % 3)) -eq 0 ]; then
+        INPUT_PILEUP_VCF_C+=("${ALL_PILEUP_VCF_FILE_PATH[$(($i))]}")
+        INPUT_PILEUP_VCF_P1+=("${ALL_PILEUP_VCF_FILE_PATH[$(($i+1))]}")
+        INPUT_PILEUP_VCF_P2+=("${ALL_PILEUP_VCF_FILE_PATH[$(($i+2))]}")
+
+        TRUE_RU_FILE_C+=("${ALL_RU_FILE_PATH[$(($i))]}")
+        TRUE_RU_FILE_P1+=("${ALL_RU_FILE_PATH[$(($i+1))]}")
+        TRUE_RU_FILE_P2+=("${ALL_RU_FILE_PATH[$(($i+2))]}")
+
+        DEPTH_S+=("${DEPTHS[$(($i))]}")
+    fi
+done
+
+
+# number of tensor chunks created for each chr
+chunk_num=20
+CHUNK_LIST=`seq 1 ${chunk_num}`
+
+# number of bin chunks created for each chr
+bin_chunk_num=10
+BIN_CHUNK_LIST=`seq 1 ${bin_chunk_num}`
+
+
+# Select trio candidates from pileup candidates using the SelectHetSnp_Trio submodule
+time ${PARALLEL} --joblog ${LOG_PATH}/S_fiter_hete_snp_pileup${_LOG_SUF}.log -j${THREADS} \
+"${PYPY} ${CLAIR3_NOVA} SelectHetSnp_Trio \
+--alt_fn_c {2} \
+--alt_fn_p1 {3} \
+--alt_fn_p2 {4} \
+--var_fn_c {5}/var_ru/var_{1} \
+--var_fn_p1 {6}/var_ru/var_{1} \
+--var_fn_p2 {7}/var_ru/var_{1} \
+--split_folder ${SPLIT_BED_PATH} \
+--sampleName ${TRIO_N} \
+--depth {8} \
+--ref_pct_full 0.2 \
+--var_pct_full 1.0 \
+--ref_var_max_ratio 1.0 \
+--for_train 1 \
+--chunk_num ${chunk_num} \
+--bed ${_TRIO_BED_PATH} \
+--ctgName ${CHR_PREFIX}{1}" ::: ${CHR[@]} ::: ${INPUT_PILEUP_VCF_C[@]} :::+ ${INPUT_PILEUP_VCF_P1[@]} :::+ ${INPUT_PILEUP_VCF_P2[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} :::+ ${DEPTH_S[@]} |& tee ${LOG_PATH}/FHSP${_LOG_SUF}.log
+
+echo "[INFO] Create Tensors"
+time ${PARALLEL} --joblog ${LOG_PATH}/S_create_tensor${_LOG_SUF}.log -j${THREADS} \
+"${PYPY} ${CLAIR3_NOVA} CreateTensorFullAlignment \
+--bam_fn {4} \
+--ref_fn {5} \
+--tensor_can_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_{2}_{3}_{1}_{6} \
+--indel_fn ${INDEL_PATH}/{2}_{3}_{1}_{6} \
+--ctgName ${CHR_PREFIX}{1} \
+--samtools ${SAMTOOLS} \
+--platform ${PLATFORM} \
+--full_aln_regions ${SPLIT_BED_PATH}/${TRIO_N}_{3}_{1}_{6} \
+--add_no_phasing_data_training \
+--phasing_info_in_bam" ::: ${CHR[@]} ::: ${ALL_SAMPLE[@]} :::+ ${DEPTHS[@]} :::+ ${ALL_PHASED_BAM_FILE_PATH[@]} :::+ ${ALL_REFERENCE_FILE_PATH[@]} ::: ${CHUNK_LIST[@]} |& tee ${LOG_PATH}/CT${_LOG_SUF}.log
+
+
+# merge trio tensors; note that the merged tensors carry no depth info
+time ${PARALLEL} --joblog ${LOG_PATH}/S_merge_tensors${_LOG_SUF}.log -j${THREADS} \
+"${PYTHON3} ${CLAIR3_NOVA} Merge_Tensors_Trio \
+--tensor_fn_c ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${ALL_SAMPLE[0]}_{3}_{1}_{2} \
+--tensor_fn_p1 ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${ALL_SAMPLE[1]}_{3}_{1}_{2} \
+--tensor_fn_p2 ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${ALL_SAMPLE[2]}_{3}_{1}_{2} \
+--candidate_fn_c ${INDEL_PATH}/${ALL_SAMPLE[0]}_{3}_{1}_{2} \
+--candidate_fn_p1 ${INDEL_PATH}/${ALL_SAMPLE[1]}_{3}_{1}_{2} \
+--candidate_fn_p2 ${INDEL_PATH}/${ALL_SAMPLE[2]}_{3}_{1}_{2} \
+--tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1}_{2} \
+--candidate_fn ${INDEL_PATH}/${TRIO_N}_{3}_{1}_{2} \
+" ::: ${CHR[@]} ::: ${CHUNK_LIST[@]} ::: ${DEPTH_S[@]} |& tee ${LOG_PATH}/MT${_LOG_SUF}.log
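+
+# A note on the GNU Parallel syntax used throughout this guide (added for
+# clarity): {1}..{N} are substituted with the argument groups supplied after
+# ::: / :::+ ; ::: forms a cross product over its group, while :::+ links its
+# group position-by-position with the previous one, e.g.:
+#   parallel echo {1}-{2} ::: a b ::: x y    # -> a-x, a-y, b-x, b-y (crossed)
+#   parallel echo {1}-{2} ::: a b :::+ x y   # -> a-x, b-y (linked)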
+
+IF_CHECK_MCV=0                     # whether to filter MCVs in the training data
+# Set whether to generate only de novo variant cases; to generate de novo
+# cases, the script switches the child and parent 1 in the input.
+# Please check 4_create_tensors_denovo.sh for a complete example.
+ONLY_DENOVO=0
+
+time ${PARALLEL} --joblog ${LOG_PATH}/S_tensor2Bin${_LOG_SUF}.log -j${THREADS} \
+"${PYTHON3} ${CLAIR3_NOVA} Tensor2Bin_Trio_denovo \
+--tensor_fn ${TENSOR_CANDIDATE_FOLDER_PATH}/tensor_can_${TRIO_N}_{3}_{1} \
+--var_fn_c {4}/var_ru/var_{1} \
+--var_fn_p1 {5}/var_ru/var_{1} \
+--var_fn_p2 {6}/var_ru/var_{1} \
+--bin_fn ${BINS_FOLDER_PATH}/${TRIO_N}_{3}_{1}_{2} \
+--chunk_id {2} \
+--chunk_num ${bin_chunk_num} \
+--platform ${PLATFORM} \
+--allow_duplicate_chr_pos \
+--maximum_non_variant_ratio 1.0 \
+--check_mcv ${IF_CHECK_MCV} \
+--only_denovo ${ONLY_DENOVO} \
+--shuffle" ::: ${CHR[@]} ::: ${BIN_CHUNK_LIST[@]} ::: ${DEPTH_S[@]} :::+ ${TRUE_RU_FILE_C[@]} :::+ ${TRUE_RU_FILE_P1[@]} :::+ ${TRUE_RU_FILE_P2[@]} |& tee ${LOG_PATH}/T2B${_LOG_SUF}.log
+
+```
+
+## 3-2 Downsample all bins
+
+Gather all bin files for model training by copying the bin files from the even/uneven coverage builds in `${BINS_FOLDER_PATH}` into `${ALL_BINS_FOLDER_PATH}`.
+
+We also recommend downsampling the bin files; see [4_1_downsample_bin.sh](4_1_downsample_bin.sh) for an example.
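+
+For instance, a minimal sketch of the gathering step (folder names here are assumed placeholders; the actual builds may use different `BUILD_N` values, e.g. the `denovo_t_bins` folders produced by 4_create_tensors_denovo.sh, and bin file names should not collide across builds):
+
+```bash
+# collect bins from the different tensor builds into one training folder
+ALL_BINS_FOLDER_PATH="[YOUR_MERGED_BINS_FOLDER]"
+mkdir -p ${ALL_BINS_FOLDER_PATH}
+for d in "${BINS_FOLDER_PATH}" "[OTHER_BINS_FOLDER]"; do
+    cp ${d}/* ${ALL_BINS_FOLDER_PATH}/
+done
+```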
+
+## 4. Train a Clair3-Nova model
+
+Please set `${ALL_BINS_FOLDER_PATH}` to the folder that contains the target bin files.
+
+```
+# Training trio model
+TRAIN_N="[YOUR_CLAIR3-NOVA_MODEL_NAME]"
+MODEL_FOLDER_PATH="${TRAIN_FOLDER_PREFIX}/train/${TRAIN_N}"
+mkdir -p ${MODEL_FOLDER_PATH}
+cd ${MODEL_FOLDER_PATH}
+
+# Training setting
+BATCH_SIZE="[YOUR_BATCH_SIZE]"     # training batch size, e.g. 800
+add_indel_length=1
+MODEL_ARC=NN
+MODEL_ALS="Clair3_Trio_Out3_denovo"
+IF_ADD_MCV_LOSS=0
+MCVLOSS_ALPHA=0
+
+# A single GPU is used for model training
+export CUDA_VISIBLE_DEVICES="0"
+
+echo "[INFO] Model training"
+time ${PYTHON3} ${CLAIR3_NOVA} Train_Trio \
+--bin_fn ${ALL_BINS_FOLDER_PATH} \
+--ochk_prefix ${MODEL_FOLDER_PATH}/trio \
+--add_indel_length ${add_indel_length} \
+--platform ${PLATFORM} \
+--validation_dataset \
+--learning_rate 0.001 \
+--maxEpoch 30 \
+--model_arc ${MODEL_ARC} \
+--model_cls ${MODEL_ALS} \
+--batch_size ${BATCH_SIZE} \
+--add_mcv_loss ${IF_ADD_MCV_LOSS} \
+--mcv_alpha ${MCVLOSS_ALPHA} \
+--train_denovo 1 \
+ |& tee ${MODEL_FOLDER_PATH}/train_log
+
+```
+
+
+## Q20 data configuration
+
+For trio data from [Q20+](https://labs.epi2me.io/askenazi-kit14-2022-12/), where the lowest-coverage sample has about 65x:
+
+| sample | coverage |
+| ------ | -------- |
+| HG002  | 70.13    |
+| HG003  | 79.66    |
+| HG004  | 64.72    |
+
+We recommend generating the trio coverage combinations below:
+
+| even coverage | HG002 | HG003 | HG004 |
+| ------------- | ----- | ----- | ----- |
+|               | 10    | 10    | 10    |
+|               | 30    | 30    | 30    |
+|               | 50    | 50    | 50    |
+|               | 65    | 65    | 65    |
+
+| uneven coverage | HG002 | HG003 | HG004 |
+| --------------- | ----- | ----- | ----- |
+|                 | 30    | 10    | 10    |
+|                 | 50    | 10    | 10    |
+|                 | 50    | 30    | 30    |
+|                 | 65    | 10    | 10    |
+|                 | 65    | 30    | 30    |
+|                 | 65    | 50    | 50    |
diff --git a/docs/trio/representation_unification_trio.md b/docs/trio/representation_unification_trio.md
index 1a9e3b8..9251102 100644
--- a/docs/trio/representation_unification_trio.md
+++ b/docs/trio/representation_unification_trio.md
@@ -1,10 +1,5 @@
 # Clair3 Representation Unification
-
-
-
-
-
 ## Introduction
 This document shows how to unify the representation between the training materials and true variant set.
diff --git a/docs/trio/run_test_nova_local_example.sh b/docs/trio/run_test_nova_local_example.sh new file mode 100644 index 0000000..ec9ab06 --- /dev/null +++ b/docs/trio/run_test_nova_local_example.sh @@ -0,0 +1,301 @@ +_MODEL_DIR_C3="/autofs/bal36/jhsu/trio/Clair3-Nova/models/clair3_models/r1041_e82_400bps_hac_v430/" +_MODEL_DIR_C3T="/autofs/bal36/jhsu/trio/Clair3-Nova/models/clair3_nova_models/r1041_e82_400bps_hac_nova/" +_TRIO_M_PREFIX='nova' +_CLAIR3_TRIO_DIR="/autofs/bal36/jhsu/trio/Clair3-Nova" + +_BAM_C=/autofs/bal36/jhsu/trio/r1041_hac/6_test_bam/HG002/HG002_60.bam +_BAM_P1=/autofs/bal36/jhsu/trio/r1041_hac/6_test_bam/HG003/HG003_60.bam +_BAM_P2=/autofs/bal36/jhsu/trio/r1041_hac/6_test_bam/HG004/HG004_60.bam +_SAMPLE_C="HG002" +_SAMPLE_P1="HG003" +_SAMPLE_P2="HG004" +_REF=/autofs/bal36/jhsu/r10/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa +_THREADS="36" +_CONTIGS="chr20" + +_OUTPUT_DIR=/autofs/bal36/jhsu/trio/output/demo + +source /autofs/bal36/jhsu/trio/script/data_config_trio.sh + +SAMPLE=( +HG002 +HG003 +HG004 +) + +DEPTH=( +60 +) + +TOOLS='clair3-nova' +_tmp_dir="demo" +#_tmp_dir="t1" +#_tmp_dir="t_with_4khz" +#_tmp_dir="ALL_28" +INPUTBAM_PATH="/autofs/bal36/jhsu/trio/r1041_hac/6_test_bam/" +_RST_PATH=/autofs/bal36/jhsu/trio/output/demo/${TOOLS}/${_tmp_dir}/ +mkdir -p $_RST_PATH +cd $_RST_PATH + + +for _DEPTH in ${DEPTH[@]} +do + +_THREADS=36 + +_BAM_C=${INPUTBAM_PATH}/${SAMPLE[0]}/${SAMPLE[0]}_${_DEPTH}.bam +_BAM_P1=${INPUTBAM_PATH}/${SAMPLE[1]}/${SAMPLE[1]}_${_DEPTH}.bam +_BAM_P2=${INPUTBAM_PATH}/${SAMPLE[2]}/${SAMPLE[2]}_${_DEPTH}.bam +_BAM_C=/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG002/phased_bam/3.bam +_BAM_P1=/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG003/phased_bam/3.bam +_BAM_P2=/autofs/bal36/jhsu/fast5/r1041/hac/1_ru/HG004/phased_bam/3.bam + +_SAMPLE_C=${SAMPLE[0]} +_SAMPLE_P1=${SAMPLE[1]} +_SAMPLE_P2=${SAMPLE[2]} + +_OUTPUT_DIR=${_RST_PATH}/${_DEPTH} +mkdir -p $_OUTPUT_DIR + +_CONTIGS="chr20" +START_POS=100000 +END_POS=300000 + +_CONTIGS="chr3" +START_POS=16902579 +END_POS=16904879 + + +echo -e "${_CONTIGS}\t${START_POS}\t${END_POS}" > quick_demo.bed + + +/usr/bin/time -v ${_CLAIR3_TRIO_DIR}/run_clair3_nova.sh \ + --bam_fn_c=${_BAM_C} \ + --bam_fn_p1=${_BAM_P1} \ + --bam_fn_p2=${_BAM_P2} \ + --output=${_OUTPUT_DIR} \ + --ref_fn=${_REF} \ + --threads=${_THREADS} \ + --model_path_clair3="${_MODEL_DIR_C3}" \ + --model_path_clair3_nova="${_MODEL_DIR_C3T}" \ + --nova_model_prefix="${_TRIO_M_PREFIX}" \ + --sample_name_c=${_SAMPLE_C} \ + --sample_name_p1=${_SAMPLE_P1} \ + --sample_name_p2=${_SAMPLE_P2} \ + --gvcf \ + --gq_bin_size=5 \ + --base_err=0.01 \ + --bed_fn=quick_demo.bed + + + #--ctg_name=chr20 \ + + +# --enable_phasing \ +# --enable_output_haplotagging + +done + + + + + +my_func2() { + +_DEPTH=$1 +TAR_SAMPLE_ID=$2 +TAR_EVALUATE_BED=$3 +TAR_EVALUATE_VCF=$4 +_RST_PATH=$5 + +REF_SDF_FILE_PATH=/autofs/bal36/jhsu/r10/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.sdf +TAR_EVALUATE_SDF=${REF_SDF_FILE_PATH} + +echo ${TAR_SAMPLE_ID} +echo ${TAR_EVALUATE_BED} +echo ${TAR_EVALUATE_VCF} +echo ${TAR_EVALUATE_SDF} + +_CONTIGS="chr20" +START_POS=10000 +END_POS=90000 + +# # happy +_OUTPUT_DIR=${_RST_PATH}/${_DEPTH} + +out_dir="${_OUTPUT_DIR}/happy" +tar_vcf="${_OUTPUT_DIR}/${TAR_SAMPLE_ID}.vcf.gz" +mkdir -p ${out_dir} +echo $out_dir + +docker run \ +--mount type=bind,source=${TAR_EVALUATE_VCF},target=/true_vcf \ +--mount type=bind,source=${TAR_EVALUATE_BED},target=/true_bed \ +--mount type=bind,source=${tar_vcf},target=/tar_vcf \ +-v 
"/autofs/bal36/jhsu/r10/input/":"/reference" \ +-v "${out_dir}":"/happy" \ +pkrusche/hap.py /opt/hap.py/bin/hap.py \ +/true_vcf \ +/tar_vcf \ +-f /true_bed \ +-r /reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa \ +-o /happy/${TAR_SAMPLE_ID}.output \ +--engine=vcfeval \ +-l ${_CONTIGS}:${START_POS}-${END_POS} \ +--pass-only \ +--threads=8 + +#-l chr20 \ +} +export -f my_func2 + +BCFTOOLS=bcftools +RTG=rtg +_TRIO_PED=/autofs/bal31/jhsu/home/data/giab/trio.ped +REF_SDF_FILE_PATH=/autofs/bal36/jhsu/r10/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.sdf +_TRIO_BED_PATH=/autofs/bal31/jhsu/home/data/giab/020304.bed +_TRIO_GIAB_MERGED=/autofs/bal31/jhsu/home/data/giab/trio/HG002_TRIO_z.vcf.gz +for _DEPTH in ${DEPTH[@]} +do + +mkdir -p ${_RST_PATH}/${_DEPTH}/trio +M_VCF=${_RST_PATH}/${_DEPTH}/trio/${SAMPLE[0]}_TRIO.vcf.gz +# add mendelian annotation +M_VCF_annotated=${_RST_PATH}/${_DEPTH}/trio/${SAMPLE[0]}_TRIO_ann.vcf.gz +# only de novo variants +denovo_VCF=${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}_TRIO_denovo.vcf.gz +# single sample for happy.py +denovo_VCF_s=${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}_TRIO_denovo_s.vcf.gz +# single sample and high dp for happy.py +denovo_VCF_sf=${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}_TRIO_denovo_sf.vcf.gz +denovo_VCF_hs=${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}_TRIO_denovo_hs.vcf.gz + +${BCFTOOLS} merge ${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}.vcf.gz \ +${_RST_PATH}/${_DEPTH}/${SAMPLE[1]}.vcf.gz \ +${_RST_PATH}/${_DEPTH}/${SAMPLE[2]}.vcf.gz \ +--threads 32 -f PASS -0 -m all| ${BCFTOOLS} view -O z -o ${M_VCF} + +${BCFTOOLS} index ${M_VCF} +${BCFTOOLS} view ${M_VCF} -H | wc -l + +${RTG} mendelian -i ${M_VCF} -o ${M_VCF_annotated} --pedigree ${_TRIO_PED} -t ${REF_SDF_FILE_PATH} |& tee ${_RST_PATH}/${_DEPTH}/trio/MDL.log + +${BCFTOOLS} view -i 'INFO/MCV ~ "0/0+0/0->0/1"' ${M_VCF_annotated} -O z -o ${denovo_VCF} +${BCFTOOLS} index ${denovo_VCF} +${BCFTOOLS} view ${denovo_VCF} -s ${SAMPLE[0]} -O z -o ${denovo_VCF_s} +${BCFTOOLS} index ${denovo_VCF_s} + +${BCFTOOLS} view -i "INFO/DNP>0.85" ${denovo_VCF} -O z -o ${denovo_VCF_hs} +${BCFTOOLS} index ${denovo_VCF_hs} + +${BCFTOOLS} view -i "INFO/DNP>0.85" ${denovo_VCF} -s ${SAMPLE[0]} -O z -o ${denovo_VCF_sf} +${BCFTOOLS} index ${denovo_VCF_sf} + +done + + +parallel --keep-order "grep 'violation of Mendelian constraints' ${_RST_PATH}/{1}/trio/MDL.log;" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)' | cut -d '/' -f 1 +parallel --keep-order "grep 'violation of Mendelian constraints' ${_RST_PATH}/{1}/trio/MDL.log; echo ; echo ;" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)' | cut -d '/' -f 1 +echo 'FN, TP, FP' + + +SAMPLE=( +HG002 +HG003 +HG004 +HG002_TRIO_denovo_s +) + +ALL_RST_PATH=( +${_RST_PATH} +${_RST_PATH} +${_RST_PATH} +${_RST_PATH} +) + +GIAB_DIR="/autofs/bal31/jhsu/home/data/giab/" +BASELINE_VCF_FILE_PATH_C="${GIAB_DIR}/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" +BASELINE_BED_FILE_PATH_C="${GIAB_DIR}/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +BASELINE_VCF_FILE_PATH_P1="${GIAB_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" +BASELINE_BED_FILE_PATH_P1="${GIAB_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +BASELINE_VCF_FILE_PATH_P2="${GIAB_DIR}/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" +BASELINE_BED_FILE_PATH_P2="${GIAB_DIR}/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" +BASELINE_VCF_FILE_PATH_TRIO="${GIAB_DIR}/trio/HG002_TRIO_z_ann_denovo_s.vcf.gz" +BASELINE_BED_FILE_PATH_TRIO="${GIAB_DIR}/020304.bed" + +ALL_TRUE_BED=( +${BASELINE_BED_FILE_PATH_C} +${BASELINE_BED_FILE_PATH_P1} 
+${BASELINE_BED_FILE_PATH_P2} +${BASELINE_BED_FILE_PATH_TRIO} +) + +ALL_TRUE_VCF=( +${BASELINE_VCF_FILE_PATH_C} +${BASELINE_VCF_FILE_PATH_P1} +${BASELINE_VCF_FILE_PATH_P2} +${BASELINE_VCF_FILE_PATH_TRIO} +) + +parallel -j 30 my_func2 ::: ${DEPTH[@]} ::: ${SAMPLE[@]} :::+ ${ALL_TRUE_BED[@]} :::+ ${ALL_TRUE_VCF[@]} :::+ ${ALL_RST_PATH[@]} + + +SAMPLE=( +HG002 +HG003 +HG004 +) +parallel --keep-order \ +"echo -n {2},{1}x, ; python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --happy_vcf_fn ${_RST_PATH}/{1}/happy/{2}.output.vcf.gz" \ +::: ${DEPTH[@]} ::: ${SAMPLE[@]} + + + +SAMPLE=( +HG002_TRIO_denovo_s +) +parallel --keep-order \ +"echo -n {2},{1}x, ; python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --oz 2 --happy_vcf_fn ${_RST_PATH}/{1}/happy/{2}.output.vcf.gz" \ +::: ${DEPTH[@]} ::: ${SAMPLE[@]} + + + + + +SAMPLE=( +HG002_TRIO_denovo_sf +) + +ALL_RST_PATH=( +${_RST_PATH} +) + +GIAB_DIR="/autofs/bal31/jhsu/home/data/giab/" +# single sample +BASELINE_VCF_FILE_PATH_TRIO="${GIAB_DIR}/trio/HG002_TRIO_z_ann_denovo_s.vcf.gz" +# single sample with filtered +BASELINE_VCF_FILE_PATH_TRIO="${GIAB_DIR}/trio/HG002_TRIO_z_ann_denovo_s_d.vcf.gz" +BASELINE_BED_FILE_PATH_TRIO="${GIAB_DIR}/020304.bed" + +ALL_TRUE_BED=( +${BASELINE_BED_FILE_PATH_TRIO} +) + +ALL_TRUE_VCF=( +${BASELINE_VCF_FILE_PATH_TRIO} +) + +parallel -j 30 my_func2 ::: ${DEPTH[@]} ::: ${SAMPLE[@]} :::+ ${ALL_TRUE_BED[@]} :::+ ${ALL_TRUE_VCF[@]} :::+ ${ALL_RST_PATH[@]} + + +parallel --keep-order \ +"echo -n {2},{1}x, ; python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --oz 2 --happy_vcf_fn ${_RST_PATH}/{1}/happy/{2}.output.vcf.gz" \ +::: ${DEPTH[@]} ::: ${SAMPLE[@]} + + + + + + + + diff --git a/docs/trio/test_2_c3.sh b/docs/trio/test_2_c3.sh deleted file mode 100644 index f45c7fa..0000000 --- a/docs/trio/test_2_c3.sh +++ /dev/null @@ -1,516 +0,0 @@ -INPUTBAM_PATH="/autofs/bal36/jhsu/trio/output/6_test_bam/" - -SAMPLE=( -HG002 -HG003 -HG004 -) - -DEPTH=( -10 -20 -30 -40 -50 -60 -) - -DEPTH=( -30 -) - - -TOOLS='clair3' -_tmp_dir="ori" - -INPUTBAM_PATH="/autofs/bal36/jhsu/trio/output/6_test_bam/ALL/" -_tmp_dir="ori_wg_t" -_RST_PATH=/autofs/bal36/jhsu/trio/output/7_test_results/${TOOLS}/${_tmp_dir}/ -mkdir $_RST_PATH -cd $_RST_PATH - -# run clair3-trio - -for _DEPTH in ${DEPTH[@]} -do - -# echo ${_DEPTH} -# echo ${_SAMPLE_N} - -# CALL -_THREADS=64 -_THREADS=36 - -HOME_DIR="/autofs/bal36/jhsu/r10" -_REF="${HOME_DIR}/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa" - -_BAM_C=${INPUTBAM_PATH}/${SAMPLE[0]}/${SAMPLE[0]}_${_DEPTH}.bam -_BAM_P1=${INPUTBAM_PATH}/${SAMPLE[1]}/${SAMPLE[1]}_${_DEPTH}.bam -_BAM_P2=${INPUTBAM_PATH}/${SAMPLE[2]}/${SAMPLE[2]}_${_DEPTH}.bam - -_SAMPLE_C=${SAMPLE[0]} -_SAMPLE_P1=${SAMPLE[1]} -_SAMPLE_P2=${SAMPLE[2]} - -_OUTPUT_DIR=${_RST_PATH}/${_DEPTH} -mkdir -p $_OUTPUT_DIR -# _OUTPUT_NAME=${_SAMPLE_N}_${_DEPTH}.vcf.gz - -echo $_BAM -echo $_OUTPUT_NAME - -INPUT_DIR=${INPUTBAM_PATH} -OUTPUT_DIR=${_OUTPUT_DIR} - -_MODEL_NAME=r1041_e82_400bps_sup_v430 -_MODEL_PATH=/autofs/bal36/jhsu/r10/models - -#TAR_SAMPLE=${SAMPLE[0]} -#mkdir -p ${_OUTPUT_DIR}/${TAR_SAMPLE} -#docker run -it \ -# -v "${INPUTBAM_PATH}":"/input" \ -# -v "${_OUTPUT_DIR}":"/output" \ -# -v "${_MODEL_PATH}":"/model" \ -# -v "/autofs/bal36/jhsu/r10/input/":"/reference" \ -# hkubal/clair3:latest \ -# /opt/bin/run_clair3.sh \ -# --bam_fn="/input/${TAR_SAMPLE}/${TAR_SAMPLE}_${_DEPTH}.bam" \ -# --ref_fn=/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa \ -# --threads=${_THREADS} \ -# --sample_name=${TAR_SAMPLE} 
\ -# --platform="ont" \ -# --model_path="/model/${_MODEL_NAME}" \ -# --output=/output/${TAR_SAMPLE}/ -# -#TAR_SAMPLE=${SAMPLE[1]} -#mkdir -p ${_OUTPUT_DIR}/${TAR_SAMPLE} -#docker run -it \ -# -v "${INPUTBAM_PATH}":"/input" \ -# -v "${_OUTPUT_DIR}":"/output" \ -# -v "${_MODEL_PATH}":"/model" \ -# -v "/autofs/bal36/jhsu/r10/input/":"/reference" \ -# hkubal/clair3:latest \ -# /opt/bin/run_clair3.sh \ -# --bam_fn="/input/${TAR_SAMPLE}/${TAR_SAMPLE}_${_DEPTH}.bam" \ -# --ref_fn=/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa \ -# --threads=${_THREADS} \ -# --sample_name=${TAR_SAMPLE} \ -# --platform="ont" \ -# --model_path="/model/${_MODEL_NAME}" \ -# --output=/output/${TAR_SAMPLE}/ -# -#TAR_SAMPLE=${SAMPLE[2]} -#mkdir -p ${_OUTPUT_DIR}/${TAR_SAMPLE} -#docker run -it \ -# -v "${INPUTBAM_PATH}":"/input" \ -# -v "${_OUTPUT_DIR}":"/output" \ -# -v "${_MODEL_PATH}":"/model" \ -# -v "/autofs/bal36/jhsu/r10/input/":"/reference" \ -# hkubal/clair3:latest \ -# /opt/bin/run_clair3.sh \ -# --bam_fn="/input/${TAR_SAMPLE}/${TAR_SAMPLE}_${_DEPTH}.bam" \ -# --ref_fn=/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa \ -# --threads=${_THREADS} \ -# --sample_name=${TAR_SAMPLE} \ -# --platform="ont" \ -# --model_path="/model/${_MODEL_NAME}" \ -# --output=/output/${TAR_SAMPLE}/ -#exit 0; -_CLAIR3_DIR=/autofs/bal36/jhsu/tools/Clair3 -_MODEL_DIR_C3="/autofs/bal36/jhsu/trio/models/rerio/clair3_models/r1041_e82_400bps_sup_v430/" - - -${_CLAIR3_DIR}/run_clair3.sh \ - --bam_fn=${_BAM_C} \ - --output=${_OUTPUT_DIR}/${SAMPLE[0]} \ - --ref_fn=${_REF} \ - --threads=${_THREADS} \ - --model_path="${_MODEL_DIR_C3}" \ - --sample_name=${_SAMPLE_C} \ - --platform="ont" \ - - - --ctg_name=chr20 - -${_CLAIR3_DIR}/run_clair3.sh \ - --bam_fn=${_BAM_P1} \ - --output=${_OUTPUT_DIR}/${SAMPLE[1]} \ - --ref_fn=${_REF} \ - --threads=${_THREADS} \ - --model_path="${_MODEL_DIR_C3}" \ - --sample_name=${_SAMPLE_P1} \ - --platform="ont" \ - - --ctg_name=chr20 - -${_CLAIR3_DIR}/run_clair3.sh \ - --bam_fn=${_BAM_P2} \ - --output=${_OUTPUT_DIR}/${SAMPLE[2]} \ - --ref_fn=${_REF} \ - --threads=${_THREADS} \ - --model_path="${_MODEL_DIR_C3}" \ - --sample_name=${_SAMPLE_P2} \ - --platform="ont" \ - - --ctg_name=chr20 - -done - - - -#my_func2() { -# -#TOOLS='clair3' -#_tmp_dir="ori" -#_RST_PATH=/autofs/bal36/jhsu/trio/output/7_test_results/${TOOLS}/${_tmp_dir}/ -# -#ALL_SAMPLE=( -#HG002 -#HG003 -#HG004 -#) -# -#ALL_SAMPLE_N=3 -# -#GIAB_DIR="/autofs/bal31/jhsu/home/data/giab/" -#BASELINE_VCF_FILE_PATH_C="${GIAB_DIR}/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" -#BASELINE_BED_FILE_PATH_C="${GIAB_DIR}/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" -#BASELINE_VCF_FILE_PATH_P1="${GIAB_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" -#BASELINE_BED_FILE_PATH_P1="${GIAB_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" -#BASELINE_VCF_FILE_PATH_P2="${GIAB_DIR}/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz" -#BASELINE_BED_FILE_PATH_P2="${GIAB_DIR}/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed" -#REF_SDF_FILE_PATH=/autofs/bal36/jhsu/r10/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.sdf -# -#ALL_TRUE_BED=( -#${BASELINE_BED_FILE_PATH_C} -#${BASELINE_BED_FILE_PATH_P1} -#${BASELINE_BED_FILE_PATH_P2} -#) -# -#ALL_TRUE_VCF=( -#${BASELINE_VCF_FILE_PATH_C} -#${BASELINE_VCF_FILE_PATH_P1} -#${BASELINE_VCF_FILE_PATH_P2} -#) -# -# -#_DEPTH=$1 -#_N=$2 -# -#TAR_SAMPLE_ID=${ALL_SAMPLE[_N-1]} -#TAR_EVALUATE_BED=${ALL_TRUE_BED[_N-1]} -#TAR_EVALUATE_VCF=${ALL_TRUE_VCF[_N-1]} -#TAR_EVALUATE_SDF=${REF_SDF_FILE_PATH} -# -#echo ${_N} -#echo 
${TAR_SAMPLE_ID} -#echo ${TAR_EVALUATE_BED} -#echo ${TAR_EVALUATE_VCF} -#echo ${TAR_EVALUATE_SDF} -# -# -## # index -# -## # happy -# -#_OUTPUT_DIR=${_RST_PATH}/${_DEPTH} -# -#out_dir="${_OUTPUT_DIR}/${TAR_SAMPLE_ID}/happy" -#tar_vcf="${_OUTPUT_DIR}/${TAR_SAMPLE_ID}/merge_output.vcf.gz" -#echo $out_dir -#mkdir -p ${out_dir} -# -# -# -#docker run \ -#--mount type=bind,source=${TAR_EVALUATE_VCF},target=/true_vcf \ -#--mount type=bind,source=${TAR_EVALUATE_BED},target=/true_bed \ -#--mount type=bind,source=${tar_vcf},target=/tar_vcf \ -#-v "/autofs/bal36/jhsu/r10/input/":"/reference" \ -#-v "${out_dir}":"/happy" \ -#pkrusche/hap.py /opt/hap.py/bin/hap.py \ -#/true_vcf \ -#/tar_vcf \ -#-f /true_bed \ -#-r /reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa \ -#-o /happy/${TAR_SAMPLE_ID}.output \ -#--engine=vcfeval \ -#-l chr20 \ -#--pass-only \ -#--threads=8 -# -#} -# -# -#export -f my_func2 -## DEPTH=( -## 50 -## 60 -## 70 -## ) -#ALL_SAMPLE_N=3 -#parallel -j 30 my_func2 ::: ${DEPTH[@]} ::: `seq ${ALL_SAMPLE_N}` -# -#TOOLS='clair3' -#_tmp_dir="ori" -#_RST_PATH=/autofs/bal36/jhsu/trio/output/7_test_results/${TOOLS}/${_tmp_dir}/ -# -# -#parallel --keep-order \ -#"echo -n {2},{1}x, ; python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --happy_vcf_fn ${_RST_PATH}/{1}/{2}/happy/{2}.output.vcf.gz" \ -#::: ${DEPTH[@]} ::: ${SAMPLE[@]} -# -# -##set -x; -# -#BCFTOOLS=bcftools -#RTG=rtg -#_TRIO_PED=/autofs/bal31/jhsu/home/data/giab/trio.ped -#REF_SDF_FILE_PATH=/autofs/bal36/jhsu/r10/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.sdf -#_TRIO_BED_PATH=/autofs/bal31/jhsu/home/data/giab/020304.bed -#_TRIO_GIAB_MERGED=/autofs/bal31/jhsu/home/data/giab/trio/HG002_TRIO_z.vcf.gz -# -#for _DEPTH in ${DEPTH[@]} -#do -#mkdir -p ${_RST_PATH}/${_DEPTH}/trio -#M_VCF=${_RST_PATH}/${_DEPTH}/trio/${SAMPLE[0]}_TRIO.vcf.gz -#M_VCF_annotated=${_RST_PATH}/${_DEPTH}/trio/${SAMPLE[0]}_TRIO_ann.vcf.gz -# -#${BCFTOOLS} merge ${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}/merge_output.vcf.gz \ -#${_RST_PATH}/${_DEPTH}/${SAMPLE[1]}/merge_output.vcf.gz \ -#${_RST_PATH}/${_DEPTH}/${SAMPLE[2]}/merge_output.vcf.gz \ -#--threads 32 -f PASS -0 -m all| ${BCFTOOLS} view -O z -o ${M_VCF} -# -#${BCFTOOLS} index ${M_VCF} -#${BCFTOOLS} view ${M_VCF} -H | wc -l -# -#${RTG} mendelian -i ${M_VCF} -o ${M_VCF_annotated} --pedigree ${_TRIO_PED} -t ${REF_SDF_FILE_PATH} |& tee ${_RST_PATH}/${_DEPTH}/trio/MDL.log -# -# -#PYPY=pypy -#_CLAIR3_TRIO_DIR=/autofs/bal31/jhsu/home/projects/git/clair3-trio-clean -#${PYPY} ${_CLAIR3_TRIO_DIR}/clair3.py Check_de_novo --call_vcf ${M_VCF} --ctgName chr20 --bed_fn $_TRIO_BED_PATH --true_vcf $_TRIO_GIAB_MERGED |& tee ${_RST_PATH}/${_DEPTH}/trio/denovo_rst -#done -# -# -## parallel --keep-order "grep 'violation of Mendelian constraints' ${CALL_PATH}/trio/{1}_MDL.log" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)' -#parallel --keep-order "grep 'violation of Mendelian constraints' ${_RST_PATH}/{1}/trio/MDL.log;" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)' | cut -d '/' -f 1 -#parallel --keep-order "grep 'violation of Mendelian constraints' ${_RST_PATH}/{1}/trio/MDL.log; echo ; echo ;" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)' | cut -d '/' -f 1 -#echo 'FN, TP, FP' -#parallel --keep-order "grep 'TP' ${_RST_PATH}/{1}/trio/denovo_rst" ::: ${DEPTH[@]} | cut -d ':' -f 2 -#parallel --keep-order "grep 'TP' ${_RST_PATH}/{1}/trio/denovo_rst; echo ; echo ;" ::: ${DEPTH[@]} | cut -d ':' -f 2 - - - -my_func2() { - -_DEPTH=$1 -TAR_SAMPLE_ID=$2 -TAR_EVALUATE_BED=$3 -TAR_EVALUATE_VCF=$4 -_RST_PATH=$5 - 
-REF_SDF_FILE_PATH=/autofs/bal36/jhsu/r10/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.sdf -TAR_EVALUATE_SDF=${REF_SDF_FILE_PATH} - -echo ${TAR_SAMPLE_ID} -echo ${TAR_EVALUATE_BED} -echo ${TAR_EVALUATE_VCF} -echo ${TAR_EVALUATE_SDF} - -# # happy -_OUTPUT_DIR=${_RST_PATH}/${_DEPTH} - -out_dir="${_OUTPUT_DIR}/happy" - -#tar_vcf="${_OUTPUT_DIR}/${TAR_SAMPLE_ID}/merge_output.vcf.gz" -tar_vcf="${_OUTPUT_DIR}/${TAR_SAMPLE_ID}.vcf.gz" -mkdir -p ${out_dir} -echo $out_dir - -docker run \ ---mount type=bind,source=${TAR_EVALUATE_VCF},target=/true_vcf \ ---mount type=bind,source=${TAR_EVALUATE_BED},target=/true_bed \ ---mount type=bind,source=${tar_vcf},target=/tar_vcf \ --v "/autofs/bal36/jhsu/r10/input/":"/reference" \ --v "${out_dir}":"/happy" \ -pkrusche/hap.py /opt/hap.py/bin/hap.py \ -/true_vcf \ -/tar_vcf \ --f /true_bed \ --r /reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fa \ --o /happy/${TAR_SAMPLE_ID}.output \ ---engine=vcfeval \ --l chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr21,chr22 \ ---pass-only \ ---threads=8 - -#-l chr20 \ -} -export -f my_func2 -# DEPTH=( -# 50 -# 60 -# 70 -# ) - -BCFTOOLS=bcftools -RTG=rtg -_TRIO_PED=/autofs/bal31/jhsu/home/data/giab/trio.ped -REF_SDF_FILE_PATH=/autofs/bal36/jhsu/r10/input/GCA_000001405.15_GRCh38_no_alt_analysis_set.sdf -_TRIO_BED_PATH=/autofs/bal31/jhsu/home/data/giab/020304.bed -_TRIO_GIAB_MERGED=/autofs/bal31/jhsu/home/data/giab/trio/HG002_TRIO_z.vcf.gz -for _DEPTH in ${DEPTH[@]} -do - -mkdir -p ${_RST_PATH}/${_DEPTH}/trio -M_VCF=${_RST_PATH}/${_DEPTH}/trio/${SAMPLE[0]}_TRIO.vcf.gz -M_VCF_annotated=${_RST_PATH}/${_DEPTH}/trio/${SAMPLE[0]}_TRIO_ann.vcf.gz -denovo_VCF=${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}_TRIO_denovo.vcf.gz -denovo_VCF_s=${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}_TRIO_denovo_s.vcf.gz -denovo_VCF_sf=${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}_TRIO_denovo_sf.vcf.gz - - -#tar_vcf="${_OUTPUT_DIR}/${TAR_SAMPLE_ID}/merge_output.vcf.gz" -_TAR_SAMPLE=${SAMPLE[0]} -cp ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}/merge_output.vcf.gz ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}.vcf.gz -${BCFTOOLS} index ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}.vcf.gz -_TAR_SAMPLE=${SAMPLE[1]} -cp ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}/merge_output.vcf.gz ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}.vcf.gz -${BCFTOOLS} index ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}.vcf.gz -_TAR_SAMPLE=${SAMPLE[2]} - -cp ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}/merge_output.vcf.gz ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}.vcf.gz -${BCFTOOLS} index ${_RST_PATH}/${_DEPTH}/${_TAR_SAMPLE}.vcf.gz - -${BCFTOOLS} merge ${_RST_PATH}/${_DEPTH}/${SAMPLE[0]}.vcf.gz \ -${_RST_PATH}/${_DEPTH}/${SAMPLE[1]}.vcf.gz \ -${_RST_PATH}/${_DEPTH}/${SAMPLE[2]}.vcf.gz \ ---threads 32 -f PASS -0 -m all| ${BCFTOOLS} view -O z -o ${M_VCF} - -${BCFTOOLS} index ${M_VCF} -${BCFTOOLS} view ${M_VCF} -H | wc -l - -${RTG} mendelian -i ${M_VCF} -o ${M_VCF_annotated} --pedigree ${_TRIO_PED} -t ${REF_SDF_FILE_PATH} |& tee ${_RST_PATH}/${_DEPTH}/trio/MDL.log - -${BCFTOOLS} view -i 'INFO/MCV ~ "0/0+0/0->0/1"' ${M_VCF_annotated} -O z -o ${denovo_VCF} -${BCFTOOLS} index ${denovo_VCF} -${BCFTOOLS} view ${denovo_VCF} -s ${SAMPLE[0]} -O z -o ${denovo_VCF_s} -${BCFTOOLS} index ${denovo_VCF_s} - - -#tabix -p vcf ${denovo_VCF} - -PYPY=pypy -_CLAIR3_TRIO_DIR=/autofs/bal31/jhsu/home/projects/git/clair3-trio-clean -${PYPY} ${_CLAIR3_TRIO_DIR}/clair3.py Check_de_novo --call_vcf ${M_VCF} --ctgName chr20 --bed_fn $_TRIO_BED_PATH --true_vcf $_TRIO_GIAB_MERGED |& tee ${_RST_PATH}/${_DEPTH}/trio/denovo_rst 
-done
-
-
-# parallel --keep-order "grep 'violation of Mendelian constraints' ${CALL_PATH}/trio/{1}_MDL.log" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)'
-parallel --keep-order "grep 'violation of Mendelian constraints' ${_RST_PATH}/{1}/trio/MDL.log;" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)' | cut -d '/' -f 1
-parallel --keep-order "grep 'violation of Mendelian constraints' ${_RST_PATH}/{1}/trio/MDL.log; echo ; echo ;" ::: ${DEPTH[@]} | cut -d ' ' -f 1,2 | tr -d '(|)' | cut -d '/' -f 1
-echo 'FN, TP, FP'
-parallel --keep-order "grep 'TP' ${_RST_PATH}/{1}/trio/denovo_rst" ::: ${DEPTH[@]} | cut -d ':' -f 2
-parallel --keep-order "grep 'TP' ${_RST_PATH}/{1}/trio/denovo_rst; echo ; echo ;" ::: ${DEPTH[@]} | cut -d ':' -f 2
-
-
-
-SAMPLE=(
-HG002
-HG003
-HG004
-HG002_TRIO_denovo_s
-)
-
-ALL_RST_PATH=(
-${_RST_PATH}
-${_RST_PATH}
-${_RST_PATH}
-${_RST_PATH}
-)
-
-GIAB_DIR="/autofs/bal31/jhsu/home/data/giab/"
-BASELINE_VCF_FILE_PATH_C="${GIAB_DIR}/HG002_GRCh38_1_22_v4.2.1_benchmark.vcf.gz"
-BASELINE_BED_FILE_PATH_C="${GIAB_DIR}/HG002_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed"
-BASELINE_VCF_FILE_PATH_P1="${GIAB_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark.vcf.gz"
-BASELINE_BED_FILE_PATH_P1="${GIAB_DIR}/HG003_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed"
-BASELINE_VCF_FILE_PATH_P2="${GIAB_DIR}/HG004_GRCh38_1_22_v4.2.1_benchmark.vcf.gz"
-BASELINE_BED_FILE_PATH_P2="${GIAB_DIR}/HG004_GRCh38_1_22_v4.2.1_benchmark_noinconsistent.bed"
-BASELINE_VCF_FILE_PATH_TRIO="${GIAB_DIR}/trio/HG002_TRIO_z_ann_denovo_s.vcf.gz"
-BASELINE_BED_FILE_PATH_TRIO="${GIAB_DIR}/020304.bed"
-
-ALL_TRUE_BED=(
-${BASELINE_BED_FILE_PATH_C}
-${BASELINE_BED_FILE_PATH_P1}
-${BASELINE_BED_FILE_PATH_P2}
-${BASELINE_BED_FILE_PATH_TRIO}
-)
-
-ALL_TRUE_VCF=(
-${BASELINE_VCF_FILE_PATH_C}
-${BASELINE_VCF_FILE_PATH_P1}
-${BASELINE_VCF_FILE_PATH_P2}
-${BASELINE_VCF_FILE_PATH_TRIO}
-)
-
-parallel -j 30 my_func2 ::: ${DEPTH[@]} ::: ${SAMPLE[@]} :::+ ${ALL_TRUE_BED[@]} :::+ ${ALL_TRUE_VCF[@]} :::+ ${ALL_RST_PATH[@]}
-
-
-SAMPLE=(
-HG002
-HG003
-HG004
-)
-parallel --keep-order \
-"echo -n {2},{1}x, ; python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --happy_vcf_fn ${_RST_PATH}/{1}/happy/{2}.output.vcf.gz" \
-::: ${DEPTH[@]} ::: ${SAMPLE[@]}
-
-
-
-SAMPLE=(
-HG002_TRIO_denovo_s
-)
-parallel --keep-order \
-"echo -n {2},{1}x, ; python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --oz 2 --happy_vcf_fn ${_RST_PATH}/{1}/happy/{2}.output.vcf.gz" \
-::: ${DEPTH[@]} ::: ${SAMPLE[@]}
-
-
-
-
-SAMPLE=(
-HG002_TRIO_denovo_s
-)
-
-ALL_RST_PATH=(
-${_RST_PATH}
-)
-
-GIAB_DIR="/autofs/bal31/jhsu/home/data/giab/"
-# single sample
-BASELINE_VCF_FILE_PATH_TRIO="${GIAB_DIR}/trio/HG002_TRIO_z_ann_denovo_s.vcf.gz"
-# single sample with filtered
-BASELINE_VCF_FILE_PATH_TRIO="${GIAB_DIR}/trio/HG002_TRIO_z_ann_denovo_s_d.vcf.gz"
-BASELINE_BED_FILE_PATH_TRIO="${GIAB_DIR}/020304.bed"
-
-ALL_TRUE_BED=(
-${BASELINE_BED_FILE_PATH_TRIO}
-)
-
-ALL_TRUE_VCF=(
-${BASELINE_VCF_FILE_PATH_TRIO}
-)
-
-#parallel -j 30 my_func2 ::: ${DEPTH[@]} ::: `seq ${ALL_SAMPLE_N}` :::+ ${ALL_RST_PATH}
-parallel -j 30 my_func2 ::: ${DEPTH[@]} ::: ${SAMPLE[@]} :::+ ${ALL_TRUE_BED[@]} :::+ ${ALL_TRUE_VCF[@]} :::+ ${ALL_RST_PATH[@]}
-
-
-parallel --keep-order \
-"echo -n {2},{1}x, ; python3 /autofs/bal31/jhsu/home/projects/clair3_t/scripts/happy.py --oz 2 --happy_vcf_fn ${_RST_PATH}/{1}/happy/{2}.output.vcf.gz" \
-::: ${DEPTH[@]} ::: ${SAMPLE[@]}
-
-
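The removed script above is the internal benchmark harness for this release: it runs hap.py per sample against the GIAB v4.2.1 truth sets, counts Mendelian-constraint violations with `rtg mendelian`, and evaluates the extracted de novo calls separately. Its de novo selection reduces to three steps; the sketch below uses placeholder file names (child.vcf.gz, trio.ped, ref.sdf) in place of the hard-coded cluster paths, so adjust them to your data.

```bash
# Merge the three single-sample VCFs into one trio VCF (PASS records only,
# missing genotypes set to 0/0 so every site can be evaluated).
bcftools merge child.vcf.gz parent1.vcf.gz parent2.vcf.gz \
    -f PASS -0 -m all | bcftools view -O z -o trio.vcf.gz
bcftools index trio.vcf.gz

# Annotate Mendelian consistency; violations are recorded in INFO/MCV.
rtg mendelian -i trio.vcf.gz -o trio_ann.vcf.gz --pedigree trio.ped -t ref.sdf

# Keep candidate de novo variants: both parents 0/0, child 0/1.
bcftools view -i 'INFO/MCV ~ "0/0+0/0->0/1"' trio_ann.vcf.gz -O z -o denovo.vcf.gz
```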
diff --git a/run_clair3_nova.sh b/run_clair3_nova.sh
index a09e37e..747ddc5 100755
--- a/run_clair3_nova.sh
+++ b/run_clair3_nova.sh
@@ -5,7 +5,7 @@ Usage="Usage: ./${SCRIPT_NAME} --bam_fn_c=BAM --bam_fn_p1=BAM --bam_fn_p2=BAM --
CMD="$0 $@"
# ENTRANCE SCRIPT FOR CLAIR3-TRIO, SETTING VARIABLE AND CALL TRIO
-VERSION='v0.1'
+VERSION='v0.3'
set -e
print_help_messages()
@@ -45,8 +45,8 @@
echo $'--chunk_size=INT The size of each chunk for parallel processing, default: 5000000.'
echo $'--print_ref_calls Show reference calls (0/0) in VCF file, default: disable.'
echo $'--include_all_ctgs Call variants on all contigs, otherwise call in chr{1..22,X,Y} and {1..22,X,Y}, default: disable.'
- echo $'--snp_min_af=FLOAT Minimum SNP AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.08,hifi:0.08,ilmn:0.08.'
- echo $'--indel_min_af=FLOAT Minimum Indel AF required for a candidate variant. Lowering the value might increase a bit of sensitivity in trade of speed and accuracy, default: ont:0.15,hifi:0.08,ilmn:0.08.'
+ echo $'--snp_min_af=FLOAT Minimum SNP AF required for a candidate variant. Lowering the value might increase sensitivity slightly at the cost of speed and accuracy, default: ont:0.08.'
+ echo $'--indel_min_af=FLOAT Minimum Indel AF required for a candidate variant. Lowering the value might increase sensitivity slightly at the cost of speed and accuracy, default: ont:0.15.'
echo $'--nova_model_prefix=STR Model prefix in trio & nova calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index, default: nova.'
echo $'--enable_output_phasing Output phased variants using whatshap, default: disable.'
echo $'--enable_output_haplotagging Output haplotagged BAM using whatshap, default: disable.'
@@ -55,6 +55,9 @@
echo $'--ref_pct_full=FLOAT EXPERIMENTAL: Specify an expected percentage of low quality 0/0 variants called in the pileup mode for full-alignment mode calling, default: 0.1 .'
echo $'--var_pct_phasing=FLOAT EXPERIMENTAL: Specify an expected percentage of high quality 0/1 variants used in WhatsHap phasing, default: 0.8 for ont guppy5 and 0.7 for other platforms.'
echo $'--pileup_model_prefix=STR EXPERIMENTAL: Model prefix in pileup calling, including $prefix.data-00000-of-00002, $prefix.data-00001-of-00002 $prefix.index. default: pileup.'
+ echo $'--keep_iupac_bases EXPERIMENTAL: Keep IUPAC reference and alternate bases, default: convert all IUPAC bases to N.'
+ echo $'--base_err=FLOAT EXPERIMENTAL: Estimated base error rate when enabling the gvcf option, default: 0.05 (a smaller value will increase the number of ./. in the output).'
+ echo $'--gq_bin_size=INT EXPERIMENTAL: GQ bin size for merging non-variant blocks when enabling the gvcf option, default: 5.'
echo $''
}
@@ -71,7 +74,7 @@ NC="\\033[0m"
ARGS=`getopt -o b:f:t:p:o:hv \
-l bam_fn_c:,bam_fn_p1:,bam_fn_p2:,ref_fn:,threads:,model_path_clair3:,model_path_clair3_nova:,platform:,output:,\
bed_fn::,vcf_fn::,ctg_name::,sample_name_c::,sample_name_p1::,sample_name_p2::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,ref_pct_full::,var_pct_phasing::,\
-resumn::,snp_min_af::,indel_min_af::,pileup_model_prefix::,nova_model_prefix::,fast_mode,gvcf,pileup_only,pileup_phasing,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,no_phasing_for_fa,call_snp_only,remove_intermediate_dir,enable_output_phasing,enable_output_haplotagging,enable_phasing,enable_long_indel,gvcf,help,version -n 'run_clair3_nova.sh' -- "$@"`
+keep_iupac_bases,base_err::,gq_bin_size::,resumn::,snp_min_af::,indel_min_af::,pileup_model_prefix::,nova_model_prefix::,fast_mode,gvcf,pileup_only,pileup_phasing,print_ref_calls,haploid_precise,haploid_sensitive,include_all_ctgs,no_phasing_for_fa,call_snp_only,remove_intermediate_dir,enable_output_phasing,enable_output_haplotagging,enable_phasing,enable_long_indel,gvcf,help,version -n 'run_clair3_nova.sh' -- "$@"`
if [ $? != 0 ] ; then echo "No input. Terminating...">&2 ; exit 1 ; fi
eval set -- "${ARGS}"
@@ -115,6 +118,9 @@ ENABLE_OUTPUT_HAPLOTAGGING=False
ENABLE_LONG_INDEL=False
PILEUP_PREFIX="pileup"
NOVA_PREFIX="nova"
+BASE_ERR=0.05
+GQ_BIN_SIZE=5
+KEEP_IUPAC_BASES=False
while true; do
case "$1" in
@@ -149,6 +155,9 @@ while true; do
--pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
--nova_model_prefix ) NOVA_PREFIX="$2"; shift 2 ;;
--gvcf ) GVCF=True; shift 1 ;;
+ --base_err ) BASE_ERR="$2"; shift 2 ;;
+ --gq_bin_size ) GQ_BIN_SIZE="$2"; shift 2 ;;
+ --keep_iupac_bases ) KEEP_IUPAC_BASES=True; shift 1 ;;
--resumn ) RESUMN="$2"; shift 2 ;;
--pileup_only ) PILEUP_ONLY=True; shift 1 ;;
--pileup_phasing ) PILEUP_PHASING=True; shift 1 ;;
@@ -270,6 +279,9 @@ echo "[INFO] ENABLE OUTPUT GVCF: ${GVCF}"
echo "[INFO] ENABLE INCLUDE ALL CTGS CALLING: ${INCLUDE_ALL_CTGS}"
echo "[INFO] ENABLE PHASING VCF OUTPUT: ${ENABLE_OUTPUT_PHASING}"
echo "[INFO] ENABLE HAPLOTAGGING BAM OUTPUT: ${ENABLE_OUTPUT_HAPLOTAGGING}"
+echo "[INFO] KEEP IUPAC BASES IN VCF OUTPUT: ${KEEP_IUPAC_BASES}"
+echo "[INFO] BASE ERROR IN GVCF: ${BASE_ERR}"
+echo "[INFO] GQ BIN SIZE IN GVCF: ${GQ_BIN_SIZE}"
echo $''
# file check
@@ -325,6 +337,9 @@ if [ -z ${PHASING_PCT} ]; then echo -e "${ERROR} Use '--var_pct_phasing=FLOAT' i
if [ -z ${PILEUP_PREFIX} ]; then echo -e "${ERROR} Use '--pileup_model_prefix=STR' instead of '--pileup_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi
if [ -z ${NOVA_PREFIX} ]; then echo -e "${ERROR} Use '--nova_model_prefix=STR' instead of '--nova_model_prefix STR' for optional parameters${NC}"; exit 1 ; fi
if [ -z ${RESUMN} ]; then echo -e "${ERROR} Use '--resumn=0,1,2,3,4' for optional parameters${NC}"; exit 1 ; fi
+if [ -z ${BASE_ERR} ]; then echo -e "${ERROR} Use '--base_err=FLOAT' instead of '--base_err FLOAT' for optional parameters${NC}"; exit 1 ; fi
+if [ -z ${GQ_BIN_SIZE} ]; then echo -e "${ERROR} Use '--gq_bin_size=INT' instead of '--gq_bin_size INT' for optional parameters${NC}"; exit 1 ; fi
+
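The two new guards mirror the existing ones. Because the optional flags are declared to getopt in the optional-argument form (`base_err::`, `gq_bin_size::`), a space-separated value never binds to its flag; the variable stays empty and the `-z` check aborts with a hint. This is easy to confirm with GNU getopt directly (illustrative only):

```bash
# '--base_err 0.05' leaves the flag's argument empty and strands the value:
getopt -o '' -l base_err:: -- --base_err 0.05
# ->  --base_err '' -- '0.05'

# '--base_err=0.05' binds as intended:
getopt -o '' -l base_err:: -- --base_err=0.05
# ->  --base_err '0.05' --
```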
# model prefix detection
if [ ! -f ${MODEL_PATH_C3D}/${NOVA_PREFIX}.index ]; then echo -e "${ERROR} No nova model found in provided model path and model prefix ${MODEL_PATH_C3D}/${NOVA_PREFIX} ${NC}"; exit 1; fi
@@ -378,7 +393,10 @@
${SCRIPT_PATH}/trio/Call_Clair3_Nova.sh \
--remove_intermediate_dir=${RM_TMP_DIR} \
--enable_phasing=${ENABLE_OUTPUT_PHASING} \
--enable_output_haplotagging=${ENABLE_OUTPUT_HAPLOTAGGING} \
- --enable_long_indel=${ENABLE_LONG_INDEL}
+ --enable_long_indel=${ENABLE_LONG_INDEL} \
+ --keep_iupac_bases=${KEEP_IUPAC_BASES} \
+ --base_err=${BASE_ERR} \
+ --gq_bin_size=${GQ_BIN_SIZE}
)) |& tee ${OUTPUT_FOLDER}/run_clair3_nova.log
diff --git a/scripts/clair3.sh b/scripts/clair3.sh
index a7256da..a955295 100755
--- a/scripts/clair3.sh
+++ b/scripts/clair3.sh
@@ -8,7 +8,7 @@ ARGS=`getopt -o b:f:t:m:p:o:r::c::s::h::g \
-l bam_fn:,ref_fn:,threads:,model_path:,platform:,output:,\
bed_fn::,vcf_fn::,ctg_name::,sample_name::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,pileup_phasing::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
-no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_long_indel:: -n 'run_clair3.sh' -- "$@"`
+no_phasing_for_fa::,pileup_model_prefix::,fa_model_prefix::,base_err::,gq_bin_size::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,keep_iupac_bases::,enable_long_indel:: -n 'run_clair3.sh' -- "$@"`
if [ $? != 0 ] ; then echo "No input. Terminating...">&2 ; exit 1 ; fi
eval set -- "${ARGS}"
@@ -46,6 +46,8 @@ while true; do
--indel_min_af ) INDEL_AF="$2"; shift 2 ;;
--pileup_model_prefix ) PILEUP_PREFIX="$2"; shift 2 ;;
--fa_model_prefix ) FA_PREFIX="$2"; shift 2 ;;
+ --base_err ) BASE_ERR="$2"; shift 2 ;;
+ --gq_bin_size ) GQ_BIN_SIZE="$2"; shift 2 ;;
--haploid_precise ) HAP_PRE="$2"; shift 2 ;;
--haploid_sensitive ) HAP_SEN="$2"; shift 2 ;;
--include_all_ctgs ) INCLUDE_ALL_CTGS="$2"; shift 2 ;;
@@ -53,6 +55,7 @@ while true; do
--remove_intermediate_dir ) RM_TMP_DIR="$2"; shift 2 ;;
--enable_phasing ) ENABLE_PHASING="$2"; shift 2 ;;
--enable_long_indel ) ENABLE_LONG_INDEL="$2"; shift 2 ;;
+ --keep_iupac_bases ) KEEP_IUPAC_BASES="$2"; shift 2 ;;
-- ) shift; break; ;;
-h|--help ) print_help_messages; break ;;
@@ -135,11 +138,14 @@ time ${PARALLEL} --retries ${RETRIES} -C ' ' --joblog ${LOG_PATH}/parallel_1_cal
--indel_min_af ${INDEL_AF} \
--call_snp_only ${SNP_ONLY} \
--gvcf ${GVCF} \
+ --base_err ${BASE_ERR} \
+ --gq_bin_size ${GQ_BIN_SIZE} \
--enable_long_indel ${ENABLE_LONG_INDEL} \
--python ${PYTHON} \
--pypy ${PYPY} \
--samtools ${SAMTOOLS} \
--temp_file_dir ${GVCF_TMP_PATH} \
+ --keep_iupac_bases ${KEEP_IUPAC_BASES} \
--pileup" :::: ${OUTPUT_FOLDER}/tmp/CHUNK_LIST |& tee ${LOG_PATH}/1_call_var_bam_pileup.log
${PYPY} ${CLAIR3} SortVcf \
@@ -255,6 +261,7 @@ time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_6_call_var_b
--python ${PYTHON} \
--pypy ${PYPY} \
--samtools ${SAMTOOLS} \
+ --keep_iupac_bases ${KEEP_IUPAC_BASES} \
--platform ${PLATFORM}" :::: ${CANDIDATE_BED_PATH}/FULL_ALN_FILES |& tee ${LOG_PATH}/6_call_var_bam_full_alignment.log
${PYPY} ${CLAIR3} SortVcf \
diff --git a/shared/param_p.py b/shared/param_p.py
index 26bf3a5..3dfad5f 100644
--- a/shared/param_p.py
+++ b/shared/param_p.py
@@ -22,7 +22,7 @@ tensorflow_threads = 4
#GVCF parameters
-base_err = 0.001
+base_err = 0.05
gq_bin_size = 5
#Pileup input feature list
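The pileup default `base_err` rises from 0.001 to 0.05 here. Roughly speaking, the gvcf error model uses this rate to judge how surprising non-reference bases are at a homozygous-reference site: with a larger assumed error rate, scattered mismatches at low depth are less damning, so fewer sites are emitted as `./.` no-calls, while `gq_bin_size` controls how coarsely adjacent non-variant sites are merged into blocks by genotype quality. A hypothetical invocation overriding both (all file and directory names are placeholders):

```bash
# Stricter error model (more './.' no-calls) and finer GQ binning than the
# v0.3 defaults of 0.05 and 5; the values here are illustrative only.
./run_clair3_nova.sh \
    --bam_fn_c=child.bam --bam_fn_p1=parent1.bam --bam_fn_p2=parent2.bam \
    --ref_fn=ref.fa --threads=8 --output=./out \
    --model_path_clair3=./clair3_model_dir \
    --model_path_clair3_nova=./clair3_nova_model_dir \
    --gvcf --base_err=0.001 --gq_bin_size=1
```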
diff --git a/shared/utils.py b/shared/utils.py
index 730ef0f..6b42fbf 100644
--- a/shared/utils.py
+++ b/shared/utils.py
@@ -21,6 +21,22 @@
# D->A or G or T
# H->A or C or T
# V->A or C or G
+
+def convert_iupac_to_n(string):
+    if string == ".":
+        return string
+    output_str = []
+    not_acgt_count = 0
+    for s in string:
+        if s.upper() not in "ACGTN,.":
+            not_acgt_count += 1
+            output_str.append('N')
+        else:
+            output_str.append(s)
+    if not_acgt_count == 0:
+        return string
+    return ''.join(output_str)
+
IUPAC_base_to_ACGT_base_dict = dict(zip(
    "ACGTURYSWKMBDHVN",
    ("A", "C", "G", "T", "T", "A", "C", "C", "A", "G", "A", "C", "A", "A", "A", "A")
diff --git a/trio/CallVarBam_Denovo.py b/trio/CallVarBam_Denovo.py
index a459bb3..5bf89ac 100644
--- a/trio/CallVarBam_Denovo.py
+++ b/trio/CallVarBam_Denovo.py
@@ -122,6 +122,7 @@ def Run(args):
    # fast_mode = CommandOption('fast_mode', args.fast_mode)
    # call_snp_only_mode = CommandOption('call_snp_only', args.call_snp_only)
    gvcf_mode = CommandOption('gvcf', args.gvcf)
+    keep_iupac_bases_mode = CommandOption('keep_iupac_bases', args.keep_iupac_bases)
    ctgStart = None
    ctgEnd = None
@@ -198,6 +199,7 @@ def Run(args):
        chunk_id,
        chunk_num,
        gvcf_mode,
+        keep_iupac_bases_mode,
    ]
    #print(' '.join(shlex.split(command_string_from(create_tensor_command_options))))
    #print(' '.join(shlex.split(command_string_from(call_variant_command_options))))
@@ -308,6 +310,9 @@ def main():
    parser.add_argument('--max_depth', type=int, default=144,
                        help="EXPERIMENTAL: Maximum full alignment depth to be processed. default: %(default)s")
+    parser.add_argument('--keep_iupac_bases', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Keep IUPAC (non-ACGTN) reference and alternate bases, default: convert all IUPAC bases to N")
+
    # options for debug purpose
    parser.add_argument('--phasing_info_in_bam', action='store_true',
diff --git a/trio/CallVariants_Denovo.py b/trio/CallVariants_Denovo.py
index 04078af..ad8aa8c 100644
--- a/trio/CallVariants_Denovo.py
+++ b/trio/CallVariants_Denovo.py
@@ -20,7 +20,8 @@
#import clair3.utils as utils
import trio.utils as utils
from clair3.task.genotype import Genotype, genotype_string_from, genotype_enum_from, genotype_enum_for_task
-from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, log_error, log_warning
+from shared.utils import IUPAC_base_to_ACGT_base_dict as BASE2ACGT, BASIC_BASES, str2bool, file_path_from, \
+    log_error, log_warning, convert_iupac_to_n
from clair3.task.variant_length import VariantLength
import trio.model as model_path
@@ -49,7 +50,8 @@
    'gvcf',
    'pileup',
    'trio_n_id',
-    'model_arc'
+    'model_arc',
+    'keep_iupac_bases'
])
OutputUtilities = namedtuple('OutputUtilities', [
    'print_debug_message',
@@ -203,7 +205,8 @@ def Run(args):
        gvcf=args.gvcf,
        pileup=args.pileup,
        trio_n_id=args.trio_n_id,
-        model_arc=args.model_arc
+        model_arc=args.model_arc,
+        keep_iupac_bases=args.keep_iupac_bases,
    )
    output_utilities = output_utilties_from(
        sample_name=args.sampleName,
@@ -247,7 +250,8 @@ def Run_call_trio(args):
        gvcf=args.gvcf,
        pileup=args.pileup,
        trio_n_id=0,
-        model_arc=args.model_arc
+        model_arc=args.model_arc,
+        keep_iupac_bases=args.keep_iupac_bases,
    )
    #output_utilities_c = output_utilties_from(
    output_utilities_c = output_utilties_class(
@@ -275,7 +279,8 @@
        gvcf=args.gvcf,
        pileup=args.pileup,
        trio_n_id=1,
-        model_arc=args.model_arc
+        model_arc=args.model_arc,
+        keep_iupac_bases=args.keep_iupac_bases,
    )
    #output_utilities_p1 = output_utilties_from(
    output_utilities_p1 = output_utilties_class(
@@ -302,7 +307,8 @@
        gvcf=args.gvcf,
        pileup=args.pileup,
        trio_n_id=2,
-        model_arc=args.model_arc
+        model_arc=args.model_arc,
+        keep_iupac_bases=args.keep_iupac_bases,
    )
    #output_utilities_p2 = output_utilties_from(
@@ -1430,6 +1436,10 @@ def decode_alt_info(alt_info_dict):
        is_reference=is_reference
    )
+    if not output_config.keep_iupac_bases:
+        reference_base = convert_iupac_to_n(reference_base)
+        alternate_base = convert_iupac_to_n(alternate_base)
+
    if output_config.is_debug:
        output_utilities.print_debug_message(
            chromosome,
@@ -1931,6 +1941,9 @@ def main():
    parser.add_argument('--haploid_sensitive', action='store_true',
                        help="EXPERIMENTAL: Enable haploid calling mode. 0/1 and 1/1 are considered as a variant")
+    parser.add_argument('--keep_iupac_bases', type=str2bool, default=False,
+                        help="EXPERIMENTAL: Keep IUPAC (non-ACGTN) reference and alternate bases, default: convert all IUPAC bases to N")
+
    # options for debug purpose
    parser.add_argument('--use_gpu', type=str2bool, default=False,
                        help="DEBUG: Use GPU for calling. Speedup is mostly insignificant. Only use this for building your own pipeline")
diff --git a/trio/Call_Clair3_Nova.sh b/trio/Call_Clair3_Nova.sh
index 2c1bc52..8dda84f 100755
--- a/trio/Call_Clair3_Nova.sh
+++ b/trio/Call_Clair3_Nova.sh
@@ -7,7 +7,7 @@ ARGS=`getopt -o f:t:p:o:r::c::s::h::g \
-l bam_fn_c:,bam_fn_p1:,bam_fn_p2:,ref_fn:,threads:,model_path_clair3:,model_path_clair3_nova:,platform:,output:,\
bed_fn::,vcf_fn::,ctg_name::,sample_name_c::,sample_name_p1::,sample_name_p2::,help::,qual::,samtools::,python::,pypy::,parallel::,whatshap::,chunk_num::,chunk_size::,var_pct_full::,var_pct_phasing::,\
snp_min_af::,indel_min_af::,ref_pct_full::,pileup_only::,pileup_phasing::,fast_mode::,gvcf::,print_ref_calls::,haploid_precise::,haploid_sensitive::,include_all_ctgs::,\
-no_phasing_for_fa::,pileup_model_prefix::,nova_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_output_haplotagging::,enable_long_indel:: -n 'run_clair3_nova.sh' -- "$@"`
+no_phasing_for_fa::,pileup_model_prefix::,nova_model_prefix::,call_snp_only::,remove_intermediate_dir::,enable_phasing::,enable_output_haplotagging::,keep_iupac_bases::,base_err::,gq_bin_size::,enable_long_indel:: -n 'run_clair3_nova.sh' -- "$@"`
if [ $? != 0 ] ; then echo "No input. Terminating...">&2 ; exit 1 ; fi
Terminating...">&2 ; exit 1 ; fi eval set -- "${ARGS}" @@ -49,6 +49,9 @@ RM_TMP_DIR=False ENABLE_PHASING=False ENABLE_LONG_INDEL=False ENABLE_OUTPUT_HAPLOTAGGING=False +BASE_ERR=0.001 +GQ_BIN_SIZE=5 +KEEP_IUPAC_BASES=False while true; do @@ -96,6 +99,9 @@ while true; do --remove_intermediate_dir ) RM_TMP_DIR="$2"; shift 2 ;; --enable_long_indel ) ENABLE_LONG_INDEL="$2"; shift 2 ;; --enable_phasing ) ENABLE_PHASING="$2"; shift 2 ;; + --base_err ) BASE_ERR="$2"; shift 2 ;; + --gq_bin_size ) GQ_BIN_SIZE="$2"; shift 2 ;; + --keep_iupac_bases ) KEEP_IUPAC_BASES="$2"; shift 2 ;; --enable_output_haplotagging ) ENABLE_OUTPUT_HAPLOTAGGING="$2"; shift 2 ;; -- ) shift; break; ;; @@ -234,6 +240,9 @@ ${PARALLEL} --retries ${RETRIES} -j3 --joblog ${LOG_PATH}/parallel_1_clair3_pil --enable_phasing=${ENABLE_PHASING} \ --enable_long_indel=${ENABLE_LONG_INDEL} \ --pileup_model_prefix=${PILEUP_PREFIX} \ + --base_err=${BASE_ERR} \ + --gq_bin_size=${GQ_BIN_SIZE} \ + --keep_iupac_bases=${KEEP_IUPAC_BASES} \ --fa_model_prefix=full_alignment" ::: ${ALL_SAMPLE[@]} :::+ ${ALL_UNPHASED_BAM_FILE_PATH[@]} |& tee ${LOG_PATH}/1_call_var_bam_pileup.log @@ -303,6 +312,7 @@ time ${PARALLEL} --retries ${RETRIES} --joblog ${LOG_PATH}/parallel_3_callvarbam --gvcf=${GVCF} \ --showRef ${SHOW_REF} \ --tmp_path ${OUTPUT_FOLDER}/tmp \ +--keep_iupac_bases=${KEEP_IUPAC_BASES} \ --phasing_info_in_bam" :::: ${CANDIDATE_BED_PATH}/FULL_ALN_FILES |& tee ${LOG_PATH}/3_CV.log ${PARALLEL} -j${THREADS} \ diff --git a/trio/param_t.py b/trio/param_t.py index de858ff..975d7f3 100644 --- a/trio/param_t.py +++ b/trio/param_t.py @@ -1,5 +1,5 @@ TOOL_NAME = "clair3_nova" -VERSION='v0.2' +VERSION='v0.3' from itertools import accumulate