Merge branch 'main' into functionmotifs

PNNL-CompBio · Dec 14, 2023 · 46b526c · 46b526c
2 parents 8de742d + 94a1374
commit 46b526c
Show file tree

Hide file tree

Showing 40 changed files with 1,452 additions and 365,790 deletions.
diff --git a/.github/workflows/action.yml b/.github/workflows/action.yml
@@ -29,6 +29,10 @@ jobs:
     needs:
       - Format
 
+    permissions:
+     contents: 'read'
+     id-token: 'write'
+
     steps:
       - uses: actions/checkout@v3
 

diff --git a/docs/source/getting_started/config.rst b/docs/source/getting_started/config.rst
@@ -116,10 +116,18 @@ The following parameters are required in your config file for `snekmer search`.
 Learn/Apply Parameters
 ``````````````````````
 
-General parameters related to Snekmer's learn and apply mode (``snekmer learn``), wherein supervised models are trained via the workflow.
+General parameters related to Snekmer's learn and apply modes (``snekmer learn``, ``snekmer apply``), wherein supervised models are trained via the workflow.
 
 =============================  =====================  =========================================================================
      Parameter                    Type                 Description
 =============================  =====================  =========================================================================
- ``save_apply_associations``     ``int``               Save optional output files containing all generated cosine similarity scores.
+ ``save_apply_associations``     ``bool``              Save large optional output files containing all generated cosine similarity scores.
+ ``conf_weight_modifier``        ``int``               Weighting modifier for updating confidence when adding data to an existing kmer count matrix.
+ ``fragmentation``               ``bool``              Option to fragment training data with multiple sub-options listed below.
+ ``version``                     ``str``               Choose 'absolute' or 'percent'. An absolute length of 50 would be 50 amino acids long.
+ ``frag_length``                 ``int``               Length of fragment. Depending on "version", this is a percent or absolute length.
+ ``min_length``                  ``int``               Minimum fragment length to retain. Fragments shorter than this are discarded.
+ ``location``                    ``str``               Choose 'start', 'end', or 'random'. This is where on a sequence a fragment is taken from.
+ ``seed``                        ``int``               Choose any (random) seed for reproducible fragmentation.
 =============================  =====================  =========================================================================
+
diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst
@@ -193,18 +193,27 @@ and directories in addition to the files described previously.
 
     .
     ├── output/
-    │   ├── ...
+    │   ├── kmerize/
+    │   │   ├── A.kmers  # kmer labels for A
+    │   │   └── B.kmers  # kmer labels for B
+    │   ├── vector/
+    │   │   ├── A.npz    # sequences, sequence IDs, and kmer vectors for A
+    │   │   └── B.npz    # sequences, sequence IDs, and kmer vectors for B
+    │   ├── vector_frag/ 
+    │   │   ├── A.npz    # Conditional output for vector when the fragmentation option is True.
+    │   │   └── B.npz    # Conditional output for vector when the fragmentation option is True.
     │   ├── learn/
     │   │   ├── kmer-counts-A.csv    # Kmer Counts matrix for A seqs
     │   │   ├── kmer-counts-B.csv     # Kmer Counts matrix for B seqs
     │   │   └── kmer-counts-total.csv    # Kmer Counts matrix for merged (total) database.
     │   ├── eval_apply/
     │   │   ├── Seq-Annotation-Scores-A.model     # Self-assessed sequence-annotation cosine similarity scores for A seqs
     │   │   ├── Seq-Annotation-Scores-B.model     # Self-assessed sequence-annotation cosine similarity scores for B seqs
+    │   ├── eval_apply_frag/
+    │   │   ├── Seq-Annotation-Scores-A.model     # Conditional output for eval_apply when the fragmentation option is True.
+    │   │   ├── Seq-Annotation-Scores-B.model     # Conditional output for eval_apply when the fragmentation option is True.
     │   ├── eval_conf/
     │   │   ├── global-confidence-scores.csv     # Global confidence score distribution
-    │   │   ├── true_total.csv    # Global confidence score True Pos distribution
-    │   │   ├── false_total.csv    # Global confidence score False Pos distribution
     │   │   └── confidence_matrix.csv   # Confidence distribution Matrix for each annotation
 
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,8 @@
 biopython
 matplotlib
-numpy >= 1.22.3
+numpy == 1.22.3
 numba >= 0.56
-pandas
+pandas == 1.4.2
 seaborn
 scipy
 scikit-learn
@@ -11,3 +11,4 @@ scikit-learn
 tabulate == 0.8.10
 umap-learn
 hdbscan
+pyarrow
diff --git a/...learnapp_tutorial_files/apply/config.yaml → resources/LA_config.yaml b/...learnapp_tutorial_files/apply/config.yaml → resources/LA_config.yaml
@@ -45,5 +45,10 @@ score_dir: "/path/to/output/scoring/"
 # learnapp params 
 learnapp:
   save_apply_associations: False
-  save_results: True
-
+  fragmentation: False
+  version: absolute
+  frag_length: 50
+  min_length: 50
+  location: random
+  seed: 999
+  conf_weight_modifier: 20
diff --git a/...learn/annotations/TIGRFAMs_annotation.ann → ...urces/annotations/TIGRFAMs_annotation.ann b/...learn/annotations/TIGRFAMs_annotation.ann → ...urces/annotations/TIGRFAMs_annotation.ann
diff --git a/resources/kmer-association-example/config.yaml b/resources/kmer-association-example/config.yaml
@@ -1,9 +1,9 @@
 # required params
-k: 14
-alphabet: 0 # choices 0-5 or names (see alphabet module), or None
+k: 8
+alphabet: 2 # choices 0-5 or names (see alphabet module), or None
 
 # input/output params
-input_dir: None # defaults to 'input'
+input_dir: input # defaults to 'input'
 input_file_exts: ["fasta", "fna", "faa", "fa"] # specify valid input file extensions
 input_file_regex: ".*" # regex to parse family from filename
 nested_output: False # if True, saves into {save_dir}/{alphabet name}/{k}
@@ -41,3 +41,13 @@ model:
 model_dir: "/path/to/output/model/"
 basis_dir: "/path/to/output/kmerize/"
 score_dir: "/path/to/output/scoring/"
+
+# learnapp params 
+learnapp:
+  save_apply_associations: False
+  fragmentation: True
+  version: absolute
+  frag_length: 50
+  min_length: 50
+  location: random
+  seed: 999