From 22f0a02a146b36a99be6f8d433cd6d7492860c3d Mon Sep 17 00:00:00 2001 From: Tara Nitka Date: Thu, 1 Feb 2024 15:34:40 -0800 Subject: [PATCH] Update docs to include motif --- docs/source/getting_started/cli.rst | 6 +- docs/source/getting_started/config.rst | 9 + docs/source/getting_started/usage.rst | 27 +++ docs/source/index.rst | 4 +- .../tutorial/snekmer_motif_tutorial.ipynb | 184 ++++++++++++++++-- 5 files changed, 214 insertions(+), 16 deletions(-) diff --git a/docs/source/getting_started/cli.rst b/docs/source/getting_started/cli.rst index a3fe68d..01f7d29 100644 --- a/docs/source/getting_started/cli.rst +++ b/docs/source/getting_started/cli.rst @@ -15,7 +15,7 @@ For an overview of Snekmer usage, reference the help command (``snekmer --help`` .. code-block:: console $ snekmer --help - usage: snekmer [-h] [-v] {cluster,model,search,learn,apply} ... + usage: snekmer [-h] [-v] {cluster,model,search,learn,apply,motif} ... Snekmer: A tool for kmer-based sequence analysis using amino acid reduction (AAR) @@ -26,7 +26,7 @@ For an overview of Snekmer usage, reference the help command (``snekmer --help`` mode: Snekmer mode - {cluster,model,search,learn,apply} + {cluster,model,search,learn,apply,motif} Tailored references for the individual operation modes can be accessed via ``snekmer {mode} --help``. @@ -49,7 +49,7 @@ files. Snekmer also assumes background files, if any, are stored in is shown below: -Snekmer ``cluster``, ``model``, and ``search`` input +Snekmer ``cluster``, ``model``, ``search``, and ``motif`` input .. code-block:: console diff --git a/docs/source/getting_started/config.rst b/docs/source/getting_started/config.rst index e21b1c5..828f07d 100644 --- a/docs/source/getting_started/config.rst +++ b/docs/source/getting_started/config.rst @@ -131,3 +131,12 @@ General parameters related to Snekmer's learn and apply mode (``snekmer learn``, ``seed`` ``int`` Choose any (random) seed for reproducible fragmentation. 
============================= ===================== ========================================================================= + +Motif Parameters +```````````````` +The following parameters are required for Snekmer's motif mode (``snekmer motif``), wherein feature selection is performed to find functionally relevant kmers. + +======================== ===================== ================================================================================== + Parameter Type Description +======================== ===================== ================================================================================== +``n`` ``int`` Number of label permutation and rescoring iterations to run for each input family. \ No newline at end of file diff --git a/docs/source/getting_started/usage.rst b/docs/source/getting_started/usage.rst index 3e07c4a..857b342 100644 --- a/docs/source/getting_started/usage.rst +++ b/docs/source/getting_started/usage.rst @@ -233,3 +233,30 @@ and directories in addition to the files described previously. │ │ ├── Seq-Annotation-Scores-D.csv # (optional) Sequence-annotation cosine similarity scores for D seqs │ │ ├── kmer-summary-C.csv # Results with annotation predictions and confidence for C seqs │ │ └── kmer-summary-D.csv # Results with annotation predictions and confidence for D seqs + +Snekmer Motif Output Files +:::::::::::::::::::::::::: + +Snekmer's motif mode produces the following output files and directories in addition to the files described previously. + +.. code-block:: console + + . + ├── output/ + │ ├── ... 
+ │ ├── motif/ + │ │ ├── kmers/ + │ │ │ ├── A.csv # kmers retained for A after recursive feature elimination + │ │ │ └── B.csv # kmers retained for B after recursive feature elimination + │ │ ├── preselection/ + │ │ │ ├── A.csv # kmer weights learned for A after recursive feature elimination + │ │ │ └── B.csv # kmer weights learned for B after recursive feature elimination + │ │ ├── sequences/ + │ │ │ ├── A.csv # Sequence vectors for A using the kmer subset retained after recursive feature elimination + │ │ │ └── B.csv # Sequence vectors for B using the kmer subset retained after recursive feature elimination + │ │ ├── scores/ + │ │ │ ├── A.csv # kmer weights learned for A on each permute/rescore iteration + │ │ │ └── B.csv # kmer weights learned for B on each permute/rescore iteration + │ │ ├── p_values/ + │ │ │ ├── A.csv # Per-kmer SVM weights and p-values for A + │ │ │ └── B.csv # Per-kmer SVM weights and p-values for B \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 5126119..6920811 100755 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -19,7 +19,7 @@ sequences to predict the nearest annotation and generate a confidence score. :width: 700 :alt: Snekmer workflow overview -There are 5 operation modes for Snekmer: ``cluster``, ``model``, ``search``, ``learn``, and ``apply``. +There are 6 operation modes for Snekmer: ``cluster``, ``model``, ``search``, ``motif``, ``learn``, and ``apply``. **Cluster mode:** The user supplies files containing sequences in an appropriate format (e.g. FASTA). Snekmer applies the relevant workflow steps and outputs the resulting clustering results in tabular form (.CSV), @@ -34,6 +34,8 @@ displays K-fold cross validation results in the form of figures (AUC ROC and PR and the models they wish to search their sequences against. Snekmer applies the relevant workflow steps and outputs a table for each file containing model annotation probabilities for the given sequences.
+**Motif mode:** The user supplies files containing sequences in an appropriate format (e.g. FASTA). Snekmer applies the relevant workflow steps and outputs a table (.csv) for each family, which shows the SVM weight and associated p-value for each kmer. + **Learn mode:** The user supplies files containing sequences in an appropriate format (e.g. FASTA) as well as an annotation file. Snekmer generates a kmer counts matrix with the summed kmer distribution of each annotation recognized from the sequence ID. Snekmer then performs a self-evaluation to assess confidence levels. There are two outputs, a counts matrix, and a global confidence distribution. diff --git a/resources/tutorial/snekmer_motif_tutorial.ipynb b/resources/tutorial/snekmer_motif_tutorial.ipynb index 058ae27..4b0eaa3 100644 --- a/resources/tutorial/snekmer_motif_tutorial.ipynb +++ b/resources/tutorial/snekmer_motif_tutorial.ipynb @@ -84,16 +84,18 @@ "\n", "If you haven't yet run the [Snekmer tutorial](https://snekmer.readthedocs.io/en/latest/tutorial/index.html), you'll need to do so now. This runs Motif (and the original three Snekmer modes) on the demo example files and produces all output files. The tutorial uses the included default configuration parameters to guide the analysis, but the user can modify these parameters if a different configuration set is desired. The tutorial command line instructions are copied below:\n", "\n", + "```bash\n", "conda activate snekmer\n", " cd resources/tutorial/demo_example\n", " ./run_demo.sh\n", + "```\n", "\n", "Finally, we will initialize some parameters and parse filenames for this demo notebook."
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "7228680b-722a-4cdb-ba2b-181bae3e2a72", "metadata": {}, "outputs": [], @@ -111,10 +113,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "cbf3925a-9ade-4b17-b7bb-12f11333f140", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'k': 14, 'alphabet': 0, 'input_file_exts': ['fasta', 'fna', 'faa', 'fa'], 'input_file_regex': '.*', 'nested_output': False, 'score': {'scaler': True, 'scaler_kwargs': {'n': 0.25}, 'labels': 'None', 'lname': 'None'}, 'cluster': {'method': 'agglomerative-jaccard', 'params': {'n_clusters': 'None', 'linkage': 'average', 'distance_threshold': 0.92, 'compute_full_tree': True}, 'cluster_plots': False, 'min_rep': None, 'max_rep': None, 'save_matrix': True, 'dist_thresh': 100}, 'model': {'cv': 5, 'random_state': 'None'}, 'model_dir': 'output/model/', 'basis_dir': 'output/kmerize/', 'score_dir': 'output/score/', 'motif': {'n': 2000}}\n" + ] + } + ], "source": [ "# load config file\n", "with open(os.path.join(\"..\", \"..\", \"resources\", \"config.yaml\"), \"r\") as configfile:\n", @@ -125,10 +135,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "a9fecdc8-7644-40ca-bd28-b381afe92a88", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['TIGR03149', 'nxrA']\n" + ] + } + ], "source": [ "filenames = sorted(\n", " [\n", @@ -168,15 +186,157 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "aba5c403-f3ca-4978-81f4-efcced6b6810", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
kmerreal scorefalse positivesnp
0VSVSSVSVVSVSSV1.000000020000.0000
1SSSVSSSSSSSSSS0.918053020000.0000
2SSSVSVSSSVSVSV0.905275020000.0000
3VSSSSVSSVSSSVS0.905275020000.0000
4SSSSVSSVSSSVSV0.905275020000.0000
..................
1488SSVVSVSSSVSVVS-0.051628198820000.9940
1489VVVSSVSSSVVVVS-0.198998199120000.9955
1490SVVSVSSSVSVVSV-0.101570200020001.0000
1491VSSSSSSSSSVSVV-0.183426200020001.0000
1492SSSSSSSSSVSVVS-0.183426200020001.0000
\n", + "

1493 rows × 5 columns

\n", + "
" + ], + "text/plain": [ + " kmer real score false positives n p\n", + "0 VSVSSVSVVSVSSV 1.000000 0 2000 0.0000\n", + "1 SSSVSSSSSSSSSS 0.918053 0 2000 0.0000\n", + "2 SSSVSVSSSVSVSV 0.905275 0 2000 0.0000\n", + "3 VSSSSVSSVSSSVS 0.905275 0 2000 0.0000\n", + "4 SSSSVSSVSSSVSV 0.905275 0 2000 0.0000\n", + "... ... ... ... ... ...\n", + "1488 SSVVSVSSSVSVVS -0.051628 1988 2000 0.9940\n", + "1489 VVVSSVSSSVVVVS -0.198998 1991 2000 0.9955\n", + "1490 SVVSVSSSVSVVSV -0.101570 2000 2000 1.0000\n", + "1491 VSSSSSSSSSVSVV -0.183426 2000 2000 1.0000\n", + "1492 SSSSSSSSSVSVVS -0.183426 2000 2000 1.0000\n", + "\n", + "[1493 rows x 5 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# read motif results\n", "results = pd.read_csv(os.path.join(\"demo_example\", \"output\", \"motif\", \"p_values\", \"nxrA.csv\"))\n", - "results = results.sort_values(by=\"motif\").reset_index(drop=True)\n", - "results[\"motif\"] = results[\"motif\"].astype(str)\n", "results" ] }, @@ -191,9 +351,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "snekmer", "language": "python", - "name": "python3" + "name": "snekmer" }, "language_info": { "codemirror_mode": { @@ -205,7 +365,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.1" + "version": "3.10.5" } }, "nbformat": 4,