From 69745d05d6ee014bc356e07ddd17518774dc80ee Mon Sep 17 00:00:00 2001 From: Tod Stuber Date: Thu, 22 Aug 2024 06:50:31 -0600 Subject: [PATCH] vsnp3 version 3.24 --- README.md | 2 +- bin/vsnp3_alignment_vcf.py | 13 ++++---- bin/vsnp3_annotation.py | 2 +- bin/vsnp3_assembly.py | 2 +- bin/vsnp3_best_reference_sourmash.py | 2 +- bin/vsnp3_bruc_mlst.py | 2 +- bin/vsnp3_download_GCA_fasta_get_metadata.py | 2 +- bin/vsnp3_download_fasta_gbk_gff_by_acc.py | 2 +- bin/vsnp3_excel_merge_defining_snps.py | 2 +- bin/vsnp3_fasta_to_fastq.py | 2 +- bin/vsnp3_fasta_to_snps_table.py | 5 ++-- bin/vsnp3_fastq_stats_seqkit.py | 2 +- bin/vsnp3_file_setup.py | 2 +- bin/vsnp3_group_on_defining_snps.py | 7 +++-- bin/vsnp3_group_reporter.py | 20 ++++++++++--- bin/vsnp3_html_step2_summary.py | 2 +- bin/vsnp3_kernel_plots.py | 2 +- bin/vsnp3_path_adder.py | 2 +- bin/vsnp3_reference_options.py | 2 +- bin/vsnp3_remove_from_analysis.py | 2 +- bin/vsnp3_spoligotype.py | 4 +-- bin/vsnp3_step1.py | 4 +-- bin/vsnp3_step2.py | 7 +++-- bin/vsnp3_table_compare.py | 2 +- bin/vsnp3_vcf_annotation.py | 2 +- bin/vsnp3_vcf_merge_to_fasta.py | 2 +- bin/vsnp3_zero_coverage.py | 31 +++++++++++++++++--- 27 files changed, 86 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index ba70cb4..41342f3 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Tested with Python 3.8 - 3.9. Anaconda [setup](./docs/instructions/conda_instructions.md) ``` -conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.23 +conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.24 ``` ## Installation test diff --git a/bin/vsnp3_alignment_vcf.py b/bin/vsnp3_alignment_vcf.py index bc902f4..f05cd3f 100755 --- a/bin/vsnp3_alignment_vcf.py +++ b/bin/vsnp3_alignment_vcf.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import subprocess @@ -336,7 +336,7 @@ def latex(self, tex, groups=None): print(r'\end{adjustbox}', file=tex) print(r'\begin{adjustbox}{width=1\textwidth}', file=tex) - print(r'\begin{tabular}{ l | l | l | l | l | l }', file=tex) + print(r'\begin{tabular}{ l | l | l | l | l | l | l }', file=tex) print(r'Mapped Paired Reads & Mapped Single Reads & Unmapped Reads & Unmapped Percent & \multicolumn{2}{l}{Unmapped Assembled Contigs} \\', file=tex) print(r'\hline', file=tex) mapped_reads = self.READS_PAIRED + self.READS_SINGLE @@ -346,16 +346,16 @@ def latex(self, tex, groups=None): print(r'\hline', file=tex) print(r'\hline', file=tex) - print(r'Duplicate Paired Reads & Duplicate Single Reads & \multicolumn{4}{l}{Duplicate Percent of Mapped Reads} \\', file=tex) + print(r'Duplicate Paired Reads & Duplicate Single Reads & \multicolumn{5}{l}{Duplicate Percent of Mapped Reads} \\', file=tex) print(r'\hline', file=tex) - print(f'{self.DUPLICATE_PAIR:,} & {self.DUPLICATE_SINGLE:,} & ' + r'\multicolumn{4}{l}{' + f'{(self.DUPLICATION_RATIO*100):,.1f}' + r'\%} \\', file=tex) + print(f'{self.DUPLICATE_PAIR:,} & {self.DUPLICATE_SINGLE:,} & ' + r'\multicolumn{5}{l}{' + f'{(self.DUPLICATION_RATIO*100):,.1f}' + r'\%} \\', file=tex) print(r'\hline', file=tex) print(r'\hline', file=tex) - print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Quality SNPs \\\\', file=tex) + print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Ambiguous SNPs & Quality SNPs \\\\', file=tex) print(r'\hline', file=tex) bam = self.zero_coverage.bam.replace('_', '\_') - print(f'{bam} & {self.zero_coverage.reference_length:,} & {(self.zero_coverage.genome_coverage*100):,.2f}\% & {self.zero_coverage.ave_coverage:,.1f}X & {self.zero_coverage.total_zero_coverage:,} & {self.zero_coverage.good_snp_count:,} \\\\', file=tex) + print(f'{bam} & {self.zero_coverage.reference_length:,} & {(self.zero_coverage.genome_coverage*100):,.2f}\% & {self.zero_coverage.ave_coverage:,.1f}X & {self.zero_coverage.total_zero_coverage:,} & {self.zero_coverage.ac1_count:,} & {self.zero_coverage.good_snp_count:,} \\\\', file=tex) print(r'\hline', file=tex) if groups: @@ -386,6 +386,7 @@ def excel(self, excel_dict): excel_dict['Average Depth'] = f'{self.zero_coverage.ave_coverage:,.1f}X' excel_dict['No Coverage Bases'] = f'{self.zero_coverage.total_zero_coverage:,}' excel_dict['Percent Ref with Zero Coverage'] = f'{self.zero_coverage.percent_ref_with_zero_coverage:,.6f}%' + excel_dict['Ambiguous SNPs'] = f'{self.zero_coverage.ac1_count:,}' excel_dict['Quality SNPs'] = f'{self.zero_coverage.good_snp_count:,}' diff --git a/bin/vsnp3_annotation.py b/bin/vsnp3_annotation.py index c5ce601..24470ae 100755 --- a/bin/vsnp3_annotation.py +++ b/bin/vsnp3_annotation.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import shutil diff --git a/bin/vsnp3_assembly.py b/bin/vsnp3_assembly.py index f250684..6325beb 100755 --- a/bin/vsnp3_assembly.py +++ b/bin/vsnp3_assembly.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import sys diff --git a/bin/vsnp3_best_reference_sourmash.py b/bin/vsnp3_best_reference_sourmash.py index 304c05a..6907a10 100755 --- a/bin/vsnp3_best_reference_sourmash.py +++ b/bin/vsnp3_best_reference_sourmash.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import subprocess diff --git a/bin/vsnp3_bruc_mlst.py b/bin/vsnp3_bruc_mlst.py index df56ea1..ed174e1 100755 --- a/bin/vsnp3_bruc_mlst.py +++ b/bin/vsnp3_bruc_mlst.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import io diff --git a/bin/vsnp3_download_GCA_fasta_get_metadata.py b/bin/vsnp3_download_GCA_fasta_get_metadata.py index 5472760..095bf76 100755 --- a/bin/vsnp3_download_GCA_fasta_get_metadata.py +++ b/bin/vsnp3_download_GCA_fasta_get_metadata.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import sys diff --git a/bin/vsnp3_download_fasta_gbk_gff_by_acc.py b/bin/vsnp3_download_fasta_gbk_gff_by_acc.py index 1a9d1e8..ad2c6cf 100755 --- a/bin/vsnp3_download_fasta_gbk_gff_by_acc.py +++ b/bin/vsnp3_download_fasta_gbk_gff_by_acc.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import argparse diff --git a/bin/vsnp3_excel_merge_defining_snps.py b/bin/vsnp3_excel_merge_defining_snps.py index 6cfa255..ff4b2e2 100755 --- a/bin/vsnp3_excel_merge_defining_snps.py +++ b/bin/vsnp3_excel_merge_defining_snps.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import re diff --git a/bin/vsnp3_fasta_to_fastq.py b/bin/vsnp3_fasta_to_fastq.py index ca6dd83..9a2d7c1 100755 --- a/bin/vsnp3_fasta_to_fastq.py +++ b/bin/vsnp3_fasta_to_fastq.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -__version__ = "3.23" +__version__ = "3.24" import gzip import os diff --git a/bin/vsnp3_fasta_to_snps_table.py b/bin/vsnp3_fasta_to_snps_table.py index a91f532..b144b5a 100755 --- a/bin/vsnp3_fasta_to_snps_table.py +++ b/bin/vsnp3_fasta_to_snps_table.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import subprocess @@ -312,7 +312,8 @@ def excel_formatter(self, df_json, write_to, group=None): # sample_path_name = self.sample_path_name st = self.st table_df = pd.read_json(df_json, orient='split') - writer = pd.ExcelWriter(write_to, engine='xlsxwriter') + writer = pd.ExcelWriter(write_to, engine='xlsxwriter',) + writer.book.use_zip64() table_df.to_excel(writer, sheet_name='Sheet1') wb = writer.book ws = writer.sheets['Sheet1'] diff --git a/bin/vsnp3_fastq_stats_seqkit.py b/bin/vsnp3_fastq_stats_seqkit.py index 5694b89..6fb8f3f 100755 --- a/bin/vsnp3_fastq_stats_seqkit.py +++ b/bin/vsnp3_fastq_stats_seqkit.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import subprocess diff --git a/bin/vsnp3_file_setup.py b/bin/vsnp3_file_setup.py index 725e7f5..266eada 100755 --- a/bin/vsnp3_file_setup.py +++ b/bin/vsnp3_file_setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import shutil diff --git a/bin/vsnp3_group_on_defining_snps.py b/bin/vsnp3_group_on_defining_snps.py index ef9d1d2..8ad361e 100755 --- a/bin/vsnp3_group_on_defining_snps.py +++ b/bin/vsnp3_group_on_defining_snps.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import sys @@ -370,8 +370,11 @@ def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, de # print(f'\n\nTotal Time: {datetime.now() - self.beginTime}\n') #Add back those that where a group was not found + if 'Group Not Found' not in groupings_dict: + groupings_dict['Group Not Found'] = {} + for sample in samples_without_group_set: - groupings_dict = {**groupings_dict, 'Group Not Found': {sample: None}} + groupings_dict['Group Not Found'][sample] = pd.DataFrame() self.groupings_dict = groupings_dict # will be passed to html summary def group_selection(self, abs_pos): diff --git a/bin/vsnp3_group_reporter.py b/bin/vsnp3_group_reporter.py index ec784b2..db609ca 100755 --- a/bin/vsnp3_group_reporter.py +++ b/bin/vsnp3_group_reporter.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import io @@ -100,6 +100,7 @@ def bin_and_html_table(self, filename, found_positions, found_positions_mix): defining_snps = self.defining_snps inverted_defining_snps = self.inverted_defining_snps try: + sample_groups_list = [] defining_snp = False for abs_position in list(defining_snps.keys() & (found_positions.keys() | found_positions_mix.keys())): #absolute positions in set union of two list group = defining_snps[abs_position] @@ -112,10 +113,21 @@ def bin_and_html_table(self, filename, found_positions, found_positions_mix): group = inverted_defining_snps[abs_position] sample_groups_list.append(group) defining_snp = True - if defining_snp: - sample_groups_list = sorted(sample_groups_list) - else: + + if defining_snp is False: # extra step to get the group name when there are mutliple defining snps for a group. + for abs_position in list(defining_snps.keys()): + set_abs_position = set(abs_position.split(", ")) + set_found_positions = set(found_positions.keys()) + is_subset = set_abs_position.issubset(set_found_positions) + if is_subset: + group = defining_snps[abs_position] + sample_groups_list.append(group) + + if len(sample_groups_list) == 0: sample_groups_list = ['No defining SNPs'] + else: + sample_groups_list = sorted(sample_groups_list) + except TypeError: message = f'File TypeError' print(f'{message}: {filename}') diff --git a/bin/vsnp3_html_step2_summary.py b/bin/vsnp3_html_step2_summary.py index 3a79b64..e2d8813 100755 --- a/bin/vsnp3_html_step2_summary.py +++ b/bin/vsnp3_html_step2_summary.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os diff --git a/bin/vsnp3_kernel_plots.py b/bin/vsnp3_kernel_plots.py index 51e9735..5c7619b 100755 --- a/bin/vsnp3_kernel_plots.py +++ b/bin/vsnp3_kernel_plots.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import re diff --git a/bin/vsnp3_path_adder.py b/bin/vsnp3_path_adder.py index 9183a4a..f4ce7f3 100755 --- a/bin/vsnp3_path_adder.py +++ b/bin/vsnp3_path_adder.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import glob diff --git a/bin/vsnp3_reference_options.py b/bin/vsnp3_reference_options.py index df0363a..6c2b37e 100755 --- a/bin/vsnp3_reference_options.py +++ b/bin/vsnp3_reference_options.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import sys diff --git a/bin/vsnp3_remove_from_analysis.py b/bin/vsnp3_remove_from_analysis.py index 905bb38..81b081f 100755 --- a/bin/vsnp3_remove_from_analysis.py +++ b/bin/vsnp3_remove_from_analysis.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import sys diff --git a/bin/vsnp3_spoligotype.py b/bin/vsnp3_spoligotype.py index e3f0972..4be6309 100755 --- a/bin/vsnp3_spoligotype.py +++ b/bin/vsnp3_spoligotype.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import gzip @@ -163,7 +163,7 @@ def spoligo(self): count_summary = pull.compute() count_summary = OrderedDict(sorted(count_summary.items())) spoligo_binary_dictionary = {} - self.call_cut_off = 4 + self.call_cut_off = 2 for k, v in count_summary.items(): if v > self.call_cut_off: spoligo_binary_dictionary.update({k: 1}) diff --git a/bin/vsnp3_step1.py b/bin/vsnp3_step1.py index 788502d..b708b73 100755 --- a/bin/vsnp3_step1.py +++ b/bin/vsnp3_step1.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import sys @@ -132,7 +132,7 @@ def run(self,): except AttributeError: pass self.MYCO = MYCO - if MYCO and self.spoligo: + if self.spoligo: spoligo = Spoligo(SAMPLE_NAME=self.sample_name, FASTQ_R1=self.FASTQ_R1, FASTQ_R2=self.FASTQ_R2, debug=self.debug) spoligo.spoligo() spoligo.latex(self.latex_report.tex) diff --git a/bin/vsnp3_step2.py b/bin/vsnp3_step2.py index 52193fc..76524ae 100755 --- a/bin/vsnp3_step2.py +++ b/bin/vsnp3_step2.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import sys @@ -218,7 +218,10 @@ def __init__(self, runtime=None, vcf_to_df=None, reference=None, groupings_dict= print("", file=htmlfile) print(f"{key}", end='\t', file=htmlfile) for group in value: - print(f"{group}", end='\t', file=htmlfile) + if group == "Group Not Found": + print(f'{group}', end='\t', file=htmlfile) + else: + print(f"{group}", end='\t', file=htmlfile) print("", file=htmlfile) print("", file=htmlfile) diff --git a/bin/vsnp3_table_compare.py b/bin/vsnp3_table_compare.py index f65fb8d..ce366a9 100755 --- a/bin/vsnp3_table_compare.py +++ b/bin/vsnp3_table_compare.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import re diff --git a/bin/vsnp3_vcf_annotation.py b/bin/vsnp3_vcf_annotation.py index 7719159..717a0b3 100755 --- a/bin/vsnp3_vcf_annotation.py +++ b/bin/vsnp3_vcf_annotation.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import re diff --git a/bin/vsnp3_vcf_merge_to_fasta.py b/bin/vsnp3_vcf_merge_to_fasta.py index 9334fcd..3e54830 100755 --- a/bin/vsnp3_vcf_merge_to_fasta.py +++ b/bin/vsnp3_vcf_merge_to_fasta.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import re diff --git a/bin/vsnp3_zero_coverage.py b/bin/vsnp3_zero_coverage.py index 82a3980..3707c4d 100755 --- a/bin/vsnp3_zero_coverage.py +++ b/bin/vsnp3_zero_coverage.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -__version__ = "3.23" +__version__ = "3.24" import os import re @@ -22,6 +22,28 @@ class Zero_Coverage(Setup): ''' def __init__(self, FASTA=None, bam=None, vcf=None, debug=False): + def count_ac1_positions(vcf_file): + ac1_count = 0 + + with open(vcf_file, 'r') as f: + for line in f: + # Skip header lines + if line.startswith('#'): + continue + + # Split the line into fields + fields = line.strip().split('\t') + + # Check if INFO field contains AC=1 + info = fields[7] + if 'AC=1' in info.split(';'): + ac1_count += 1 + + return ac1_count + + self.ac1_count = count_ac1_positions(vcf) + print(f"Number of positions with AC=1: {self.ac1_count:,}") + Setup.__init__(self, FASTA=FASTA, debug=debug) self.print_run_time('Zero Coverage') self.sample_name = re.sub('[_.].*', '', bam) @@ -116,11 +138,11 @@ def latex(self, tex): print(r'\end{center}', file=tex) print(r'\end{adjustbox}', file=tex) print(r'\begin{adjustbox}{width=1\textwidth}', file=tex) - print(r'\begin{tabular}{ l | l | l | l | l | l | l }', file=tex) - print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Quality SNPs \\\\', file=tex) + print(r'\begin{tabular}{ l | l | l | l | l | l | l | l }', file=tex) + print(f'BAM File & Reference Length & Genome with Coverage & Average Depth & No Coverage Bases & Ambiguous SNPs & Quality SNPs \\\\', file=tex) print(r'\hline', file=tex) bam = self.bam.replace('_', '\_') - print(f'{bam} & {self.reference_length:,} & {(self.genome_coverage*100):,.2f}\% & {self.ave_coverage:,.1f}X & {self.total_zero_coverage:,} & {self.good_snp_count:,} \\\\', file=tex) + print(f'{bam} & {self.reference_length:,} & {(self.genome_coverage*100):,.2f}\% & {self.ave_coverage:,.1f}X & {self.total_zero_coverage:,} & {self.ac1_count:,} & {self.good_snp_count:,} \\\\', file=tex) print(r'\hline', file=tex) print(r'\end{adjustbox}', file=tex) print(r'\vspace{0.1 mm}', file=tex) @@ -136,6 +158,7 @@ def excel(self, excel_dict): excel_dict['Average Depth'] = f'{self.ave_coverage:,.1f}' excel_dict['No Coverage Bases'] = f'{self.total_zero_coverage:,}' excel_dict['Percent Ref with Zero Coverage'] = f'{self.percent_ref_with_zero_coverage:,.6f}%' + excel_dict['Ambiguous SNPs'] = f'{self.ac1_count:,}' excel_dict['Quality SNPs'] = f'{self.good_snp_count:,}'