-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathparams.yaml
106 lines (105 loc) · 6.16 KB
/
params.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# parameters for execution of the pipeline, handled by dvc
# execution of pipeline steps is dependent on these and on the data dependencies of the steps
#########################################################################
# parameters for the get_raw_data_taxa pipeline step:
# length window used to filter 16s sequences
get_raw_data_taxa:
  min_16s_len: 1300  # min number of nucleotides in 16s to keep
  max_16s_len: 1600  # max number of nucleotides in 16s to keep
# parameters for the get_raw_data_proteins pipeline step
get_raw_data_proteins:
  # NOTE(review): presumably limits the step to a single uniprot file for
  # development runs - confirm against the step's script
  dev_only_one_uniprot_file: false
# parameters for the get_raw_data_HMM pipeline step
get_raw_data_HMM:
  # NOTE(review): presumably limits the step to a single HMM file for
  # development runs - confirm against the step's script
  dev_only_one_HMM_file: false
# parameters for the get_raw_data_Hait pipeline step
get_raw_data_Hait:
  # NOTE(review): presumably limits the step to a single Hait file for
  # development runs - confirm against the step's script
  dev_only_one_Hait_file: false
# parameters for the parse_proteins pipeline step
parse_proteins:
  max_prot_per_file: 100000  # size of parquet files containing proteins
  # NOTE(review): presumably limits parsing to a single uniprot file for
  # development runs - confirm against the step's script
  dev_only_one_uniprot_file: false
# parameters for the label_taxa pipeline step
label_taxa:
  ogt_threshold: 40.0  # binary threshold to consider something as thermophilic
# parameters for the get_16s_blast_scores pipeline step:
# nucleotide alignment settings and the metrics recorded for each hit
get_16s_blast_scores:
  num_threads: 20  # number of cpus to run in parallel
  word_size: 28  # size of words for initial exact matching of nucleotides
  gapopen_penalty: 2  # cost of bit score to open a gap
  gapextend_penalty: 1  # cost of bit score to extend gap
  reward: 1  # reward for nucleotide match
  penalty: -2  # cost of nucleotide mismatch
  ungapped: false  # whether to do ungapped alignment
  # NOTE(review): presumably a sample size for development runs, with null
  # meaning no sampling - confirm against the step's script
  dev_n_sample: null
  # list of methods of learn2therm.blast.BlastMetics to record for aligned
  # hits of 16s rRNA sequence
  blast_metrics:
    - local_gap_compressed_percent_id
    - scaled_local_query_percent_id
    - scaled_local_symmetric_percent_id
    - local_E_value
    - query_align_start
    - query_align_end
    - subject_align_end
    - subject_align_start
    - query_align_len
    - query_align_cov
    - subject_align_len
    - subject_align_cov
    - bit_score
# parameters for the label_all_pairs pipeline step
label_all_pairs:
  # define which metrics to use to label a taxa pair; each entry maps a
  # metric name to a threshold and the direction of the comparison
  blast_metric_thresholds:
    local_gap_compressed_percent_id:  # metric name
      thresh: 0.81  # float threshold
      greater: true  # greater than threshold is kept
    query_align_cov:
      thresh: 0.985
      greater: true
    subject_align_cov:
      thresh: 0.985
      greater: true
  # NOTE(review): presumably limits the step to a single file for development
  # runs; its placement at stage level (not inside blast_metric_thresholds)
  # is inferred - confirm against the step's script
  dev_only_one_file: false
# parameters for the get_protein_blast_scores pipeline step:
# worker distribution, aligner choice, per-aligner settings, and the metrics
# recorded for each protein alignment hit
get_protein_blast_scores:
  # dask cluster class used to distribute workers. Update .config/dask as well
  dask_cluster_class: 'SLURMCluster'
  max_protein_length: 250  # only consider proteins with length lte number
  method: 'diamond'  # which aligner (blast or diamond) to use
  n_jobs: 80  # number of parallel dask workers
  # how many taxa pairs to finish aligning between dvc checkpoints
  save_frequency: 20000
  # NOTE(review): presumably a number of taxa pairs to sample for development
  # runs, with null meaning no sampling - confirm against the step's script
  dev_sample_pairs: null
  # see https://biopython.org/docs/1.76/api/Bio.Blast.Applications.html
  method_blast_params:
    num_threads: 6  # number of threads to run on alignment worker
    word_size: 3  # size of words for initial exact matching of amino acids
    gapopen: 11  # cost of bit score to open a gap
    gapextend: 1  # cost of bit score to extend gap
    matrix: 'BLOSUM62'  # scoring matrix for amino acid comparisons in alignment
    threshold: 11  # minimum score for word to be added to blast lookup table
    ungapped: false  # whether to do ungapped alignment
    evalue: .00001  # maximum evalue to keep alignment
    qcov_hsp_perc: 75  # minimum percent query coverage
  # see https://github.com/bbuchfink/diamond
  method_diamond_params:
    num_threads: 6  # number of threads to run on alignment worker
    # the initial matching filters that diamond uses are more stringent with
    # less sensitivity
    sensitivity: 'ultra-sensitive'
    # start with low sensitivity and only go lower towards target sensitivity
    # for failure
    iterate: false
    # hard limit on the number of Smith Waterman extensions that will be
    # computed for each query.
    global_ranking: null
    gapopen: 11  # cost of bit score to open a gap
    gapextend: 1  # cost of bit score to extend a gap
    matrix: 'BLOSUM62'  # scoring matrix for amino acid comparisons in alignment
    evalue: .00001  # maximum evalue to keep alignment
    hsp_cov: 75  # minimum coverage of both strands
  # list of methods of learn2therm.blast.BlastMetics to record for aligned
  # hits in protein alignment
  blast_metrics:
    - local_gap_compressed_percent_id
    - scaled_local_query_percent_id
    - scaled_local_symmetric_percent_id
    - local_E_value
    - query_align_start
    - query_align_end
    - subject_align_end
    - subject_align_start
    - query_align_len
    - query_align_cov
    - subject_align_len
    - subject_align_cov
    - bit_score
# parameters for the run_hmmer pipeline step (validation step)
run_hmmer:
  e_value: 1.e-10  # maximum evalue to mark protein with HMM match
  # vector size from duckdb to get proteins in chunks and run hmmer on them
  chunk_size: 2000
  prefetch: true  # whether to load hmms into memory or leave as disk iterator
  njobs: 32  # number of cpus for pyhmmer
  scan: false  # hmmscan or hmmsearch
  # labels protein pairs with booleans, if greater than this score, not
  # discussed in paper
  jaccard_threshold: 0.79
  # NOTE(review): presumably runs the step on a sample of the data for
  # development - confirm against the step's script
  dev_sample_data: false
# parameters for the sample_data_for_structure pipeline step
sample_data_for_structure:
  # list of queries to make of protein pairs table, these metrics are sampled
  # uniformly for structural alignment
  metrics:
    - "(query_align_cov+subject_align_cov)/2.0"
  sample_size: 10000  # number of protein pairs to run alignment on