Skip to content

Commit

Permalink
vsnp3 version 3.26
Browse files Browse the repository at this point in the history
  • Loading branch information
stuber committed Sep 25, 2024
1 parent 0974549 commit 902d5a9
Show file tree
Hide file tree
Showing 27 changed files with 58 additions and 44 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ This step combines the VCF files from Step 1 to create SNP matrices and construc
# Installation

```bash
conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.25
conda create -c conda-forge -c bioconda -n vsnp3 vsnp3=3.26
```

For detailed Miniconda setup instructions, see [conda instructions](./docs/instructions/conda_instructions.md).
Expand Down Expand Up @@ -178,6 +178,9 @@ For detailed usage of each script, use the `-h` option.

For information on additional tools, see [Additional Tools](./docs/instructions/additional_tools.md).

## Archived Details
Archived vSNP details are available [here](https://github.com/USDA-VS/vSNP/blob/master/docs/detailed_usage.md).

##

For more information or support, please open an [issue on GitHub](https://github.com/USDA-VS/vSNP3/issues) or [email](mailto:tod.p.stuber@usda.gov) the maintainer directly.
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_alignment_vcf.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_annotation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import shutil
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_assembly.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_best_reference_sourmash.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_bruc_mlst.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import io
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_download_GCA_fasta_get_metadata.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_download_fasta_gbk_gff_by_acc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import argparse
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_excel_merge_defining_snps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_fasta_to_fastq.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

__version__ = "3.25"
__version__ = "3.26"

import gzip
import os
Expand Down
39 changes: 24 additions & 15 deletions bin/vsnp3_fasta_to_snps_table.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import subprocess
Expand Down Expand Up @@ -141,12 +141,13 @@ def get_parsimonious_pos(self, in_df):

class Tables:

def __init__(self, fasta_alignments=None, df_alignments=None, tree=None, gbk=None, mq=None, write_path=None, groupings_dict=None, table_name=None, debug=False,):
def __init__(self, fasta_alignments=None, df_alignments=None, tree=None, gbk=None, mq=None, write_path=None, groupings_dict=None, show_groups=False, table_name=None, debug=False,):
self.fasta_alignments = fasta_alignments
self.df_alignments = df_alignments
self.tree = tree
self.gbk = gbk
self.mq = mq
self.show_groups = show_groups
self.debug = debug
self.st = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d_%H-%M-%S')
self.groupings_dict = groupings_dict
Expand Down Expand Up @@ -289,10 +290,11 @@ def write_out_table(self, df, table_type=None):
# df = df.append(pd.Series(name='no annotations'))
df = pd.concat([df, pd.Series(name='no annotations').to_frame().T])

# Join the list items in the dictionary values into single strings
joined_data = {key: '; '.join(map(str, value)) for key, value in self.group_vcfs_dict.items()}
# Make groupings into a new Series from the dictionary of sample names: [list group names]
new_series = pd.Series(joined_data)
if self.show_groups:
# Join the list items in the dictionary values into single strings
joined_data = {key: '; '.join(map(str, value)) for key, value in self.group_vcfs_dict.items()}
# Make groupings into a new Series from the dictionary of sample names: [list group names]
new_series = pd.Series(joined_data)

# Check if 'Grouping' column already exists
# if 'Grouping' not in df.columns:
Expand All @@ -309,7 +311,7 @@ def write_out_table(self, df, table_type=None):
# print(f'{column_count} columns > {max_size}, cascade table break {count}')
chunck_end += max_size
df_split = df.iloc[:, chunk_start:chunck_end]
if 'Grouping' not in df.columns:
if 'Grouping' not in df.columns and self.show_groups:
df_split.insert(0, 'Grouping', new_series)
df_split.to_json(f'{self.write_path}/df{count}.json', orient='split')
self.excel_formatter(f'{self.write_path}/df{count}.json', f'{self.write_path}/{self.table_name}_{table_type}_table{count}-{self.st}.xlsx')
Expand All @@ -319,14 +321,14 @@ def write_out_table(self, df, table_type=None):
count += 1
# print(f'Last break {column_count} columns, cascade table break {count}')
df_split = df.iloc[:, chunk_start:]
if 'Grouping' not in df.columns:
if 'Grouping' not in df.columns and self.show_groups:
df_split.insert(0, 'Grouping', new_series)
df_split.to_json(f'{self.write_path}/df{count}.json', orient='split')
self.excel_formatter(f'{self.write_path}/df{count}.json', f'{self.write_path}/{self.table_name}_{table_type}_table{count}-{self.st}.xlsx')
os.remove(f'{self.write_path}/df{count}.json')
else: # no break needed
# Insert the new column at position 1 (right after the sample names column)
if 'Grouping' not in df.columns:
if 'Grouping' not in df.columns and self.show_groups:
df.insert(0, 'Grouping', new_series)
df.to_json(f'{self.write_path}/df.json', orient='split')
self.excel_formatter(f'{self.write_path}/df.json', f'{self.write_path}/{self.table_name}_{table_type}_table-{self.st}.xlsx')
Expand Down Expand Up @@ -354,20 +356,26 @@ def excel_formatter(self, df_json, write_to, group=None):
formatN = wb.add_format({'bg_color': '#E2CFDD'})
rows, cols = table_df.shape

#'first_row', 'first_col', 'last_row', and 'last_col'
# Careful that row/column locations don't overlap
if self.show_groups:
start_col = 2
else:
start_col = 1

ws.set_column(0, 0, 30)
ws.set_column(1, cols, 2.1)
ws.freeze_panes(2, 2)
ws.freeze_panes(2, start_col)
formatannotation = wb.add_format({'font_color': '#0A028C', 'rotation': '-90', 'align': 'top'})
#set last row
ws.set_row(rows + 1, cols + 1, formatannotation)

#'first_row', 'first_col', 'last_row', and 'last_col'
# Careful that row/column locations don't overlap
start_col = 2 # This is column C
end_col = cols

ws.conditional_format(rows - 2, start_col, rows - 1, end_col, {'type': 'cell', 'criteria': '<', 'value': 55, 'format': formatlowqual})
ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'cell', 'criteria': '==', 'value': 'C$2', 'format': formatnormal})
if self.show_groups:
ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'cell', 'criteria': '==', 'value': 'C$2', 'format': formatnormal})
else:
ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'cell', 'criteria': '==', 'value': 'B$2', 'format': formatnormal})
ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'text', 'criteria': 'containing', 'value': 'A', 'format': formatA})
ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'text', 'criteria': 'containing', 'value': 'G', 'format': formatG})
ws.conditional_format(2, start_col, rows - 2, end_col, {'type': 'text', 'criteria': 'containing', 'value': 'C', 'format': formatC})
Expand Down Expand Up @@ -506,6 +514,7 @@ def __init__(self, fasta_alignments=None, debug=False,):

parser.add_argument('-f', '--fasta', action='store', dest='fasta', required=True, help='Provide an alignment file in FASTA format')
parser.add_argument('-p', '--parsimonious', action='store_true', dest='parsimonious', help='Only keep parsimonious SNPs from FASTA alignment file. This is different than the uninformative SNPs removed via vSNP pipeline. This is to be used when just working with an aligned FASTA file.')
parser.add_argument('--show_groups', action='store_true', dest='show_groups', help='Show group names in SNP table')
parser.add_argument('-n', '--hash_names', action='store_true', dest='hash_names', help='Hash FASTA names to rid of any RAxML illegal characters')
parser.add_argument('-d', '--debug', action='store_true', dest='debug', help='Optional: Keep debugging files and run without pooling')
parser.add_argument('-v', '--version', action='version', version=f'{os.path.abspath(__file__)}: version {__version__}')
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_fastq_stats_seqkit.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import subprocess
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_file_setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import shutil
Expand Down
7 changes: 4 additions & 3 deletions bin/vsnp3_group_on_defining_snps.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import sys
Expand Down Expand Up @@ -41,7 +41,7 @@ class bcolors:
class Group():
'''
'''
def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, defining_snps=None, dataframes=None, pickle_file=None, abs_pos=None, group=None, all_vcf=None, find_new_filters=None, no_filters=True, qual_threshold=150, n_threshold=50, mq_threshold=56, hash_groups=None, debug=False):
def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, defining_snps=None, dataframes=None, pickle_file=None, abs_pos=None, group=None, all_vcf=None, find_new_filters=None, no_filters=True, qual_threshold=150, n_threshold=50, mq_threshold=56, show_groups=False, hash_groups=None, debug=False):

self.qual_threshold = qual_threshold
self.n_threshold = n_threshold
Expand All @@ -50,6 +50,7 @@ def __init__(self, cwd=None, metadata=None, excel_remove=None, gbk_list=None, de
self.vcf_bad_list=[]
filter_all_list=None
defining_snps_dict = None
self.show_groups = show_groups
self.debug = debug

if cwd == None:
Expand Down Expand Up @@ -560,7 +561,7 @@ def sort_df(self, df):

def raxml_table_build(self, group):
tree = Tree(fasta_alignments=self.group_fasta_dict[group], write_path=f'{self.cwd}/{group}', tree_name=group)
tables = Tables(df_alignments=self.group_dataframe_dict[group], tree=tree.newick, gbk=self.annotation_df, mq=self.average_mq_df, write_path=f'{self.cwd}/{group}', groupings_dict=self.groupings_dict, table_name=group, debug=False)
tables = Tables(df_alignments=self.group_dataframe_dict[group], tree=tree.newick, gbk=self.annotation_df, mq=self.average_mq_df, write_path=f'{self.cwd}/{group}', groupings_dict=self.groupings_dict, show_groups=self.show_groups, table_name=group, debug=False)
tables.build_tables()
self.raxml_version = tree.raxml_version

Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_group_reporter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import io
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_html_step2_summary.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os

Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_kernel_plots.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_path_adder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import glob
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_reference_options.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_remove_from_analysis.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import sys
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_spoligotype.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import gzip
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_step1.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import sys
Expand Down
5 changes: 3 additions & 2 deletions bin/vsnp3_step2.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import sys
Expand Down Expand Up @@ -315,6 +315,7 @@ def __init__(self, runtime=None, vcf_to_df=None, reference=None, groupings_dict=
parser.add_argument('-abs_pos', '--abs_pos', action='store', dest='abs_pos', required=False, help='Optional: Make a group on defining SNP. Must be supplied with --group option. Format as chrom in VCF, chrom:10000.')
parser.add_argument('-group', '--group', action='store', dest='group', required=False, help='Optional: Name a group on defining SNP. Must be supplied with --abs_pos option')
parser.add_argument('-hash', '--hash_groups', action='store_true', dest='hash_groups', required=False, help='Optional: The option will run defining snps marked with a # in the defining snps file. The # is removed and the defining snps are run.')
parser.add_argument('--show_groups', action='store_true', dest='show_groups', help='Show group names in SNP table')
parser.add_argument('-d', '--debug', action='store_true', dest='debug', help='Optional: Keep debugging files and run without pooling. A pickle file will be kept for troubleshooting to be used directly in vsnp3_group_on_defining_snps.py. This saves processing time')
parser.add_argument('-v', '--version', action='version', version=f'{os.path.basename(__file__)}: version {__version__}')
args = parser.parse_args()
Expand Down Expand Up @@ -420,7 +421,7 @@ def zipit(src, dst):
shutil.copy(args.defining_snps, starting_files) #package with starting files for the record
zipit(starting_files, starting_files) # zip starting files directory

group = Group(cwd=global_working_dir, metadata=args.metadata, defining_snps=args.defining_snps, excel_remove=args.remove_by_name, gbk_list=args.gbk, dataframes=vcf_to_df.dataframes, all_vcf=args.all_vcf, find_new_filters=args.find_new_filters, no_filters=args.no_filters, qual_threshold=int(args.qual_threshold), n_threshold=int(args.n_threshold), mq_threshold=int(args.mq_threshold), abs_pos=args.abs_pos, group=args.group, hash_groups=args.hash_groups, debug=args.debug)
group = Group(cwd=global_working_dir, metadata=args.metadata, defining_snps=args.defining_snps, excel_remove=args.remove_by_name, gbk_list=args.gbk, dataframes=vcf_to_df.dataframes, all_vcf=args.all_vcf, find_new_filters=args.find_new_filters, no_filters=args.no_filters, qual_threshold=int(args.qual_threshold), n_threshold=int(args.n_threshold), mq_threshold=int(args.mq_threshold), abs_pos=args.abs_pos, group=args.group, show_groups=args.show_groups, hash_groups=args.hash_groups, debug=args.debug)
vcf_to_df.vcf_bad_list = vcf_to_df.vcf_bad_list + group.vcf_bad_list

setup.print_time()
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_table_compare.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_vcf_annotation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_vcf_merge_to_fasta.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import re
Expand Down
2 changes: 1 addition & 1 deletion bin/vsnp3_zero_coverage.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "3.25"
__version__ = "3.26"

import os
import re
Expand Down

0 comments on commit 902d5a9

Please sign in to comment.