diff --git a/.gitignore b/.gitignore index ea9577c..c1eb46a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -/model \ No newline at end of file +/model +/data \ No newline at end of file diff --git a/notebooks/data_clf_experiments_final.ipynb b/notebooks/data_clf_experiments_final.ipynb index e0da12c..c0003e0 100644 --- a/notebooks/data_clf_experiments_final.ipynb +++ b/notebooks/data_clf_experiments_final.ipynb @@ -3414,7 +3414,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.12" }, "varInspector": { "cols": { diff --git a/notebooks/run classifier.ipynb b/notebooks/run classifier.ipynb index 7753316..a01c51e 100644 --- a/notebooks/run classifier.ipynb +++ b/notebooks/run classifier.ipynb @@ -2,11 +2,34 @@ "cells": [ { "cell_type": "code", - "execution_count": null, - "id": "10a5ec5c", + "execution_count": 14, + "id": "bedcd040", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#in ops environment" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e5301a37", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "import base64" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f69e89a4", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import pipeline " + ] }, { "cell_type": "code", @@ -15,78 +38,222 @@ "metadata": {}, "outputs": [ { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[1;32mC:\\Users\\PAOLA~1.YEL\\AppData\\Local\\Temp/ipykernel_8484/1646925271.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mInstructorEmbedding\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mINSTRUCTOR\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\InstructorEmbedding\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0minstructor\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\InstructorEmbedding\\instructor.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mautonotebook\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mtrange\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 8\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mtorch\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mTensor\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 9\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0msentence_transformers\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mSentenceTransformer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 10\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msentence_transformers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodels\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mTransformer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 11\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mAutoConfig\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sentence_transformers\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0m__version__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"2.2.2\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[0m__MODEL_HUB_ORGANIZATION__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'sentence-transformers'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mdatasets\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mSentencesDataset\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mParallelSentencesDataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mLoggingHandler\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mLoggingHandler\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mSentenceTransformer\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mSentenceTransformer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sentence_transformers\\datasets\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mDenoisingAutoEncoderDataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mDenoisingAutoEncoderDataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mNoDuplicatesDataLoader\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mNoDuplicatesDataLoader\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mParallelSentencesDataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mParallelSentencesDataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mSentencesDataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mSentencesDataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mSentenceLabelDataset\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mSentenceLabelDataset\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\sentence_transformers\\datasets\\DenoisingAutoEncoderDataset.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreaders\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mInputExample\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mInputExample\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[1;32mimport\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 6\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtokenize\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtreebank\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mTreebankWordDetokenizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\nltk\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 130\u001b[0m \u001b[1;31m# Import top-level functionality into top-level namespace\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 131\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 132\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcollocations\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 133\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdecorators\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mdecorator\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmemoize\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 134\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfeatstruct\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\nltk\\collocations.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 34\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 35\u001b[0m \u001b[1;31m# these two unused imports are referenced in collocations.doctest\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 36\u001b[1;33m from nltk.metrics import (\n\u001b[0m\u001b[0;32m 37\u001b[0m \u001b[0mBigramAssocMeasures\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 38\u001b[0m \u001b[0mContingencyMeasures\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\nltk\\metrics\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 16\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0magreement\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mAnnotationTask\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 17\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmetrics\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0maline\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0malign\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 18\u001b[1;33m from nltk.metrics.association import (\n\u001b[0m\u001b[0;32m 19\u001b[0m \u001b[0mBigramAssocMeasures\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 20\u001b[0m \u001b[0mContingencyMeasures\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\nltk\\metrics\\association.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 25\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 26\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstats\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mfisher_exact\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 27\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 28\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\stats\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 439\u001b[0m \"\"\"\n\u001b[0;32m 440\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 441\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mstats\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 442\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mdistributions\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 443\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0mmorestats\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\stats\\stats.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 41\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mscipy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mspecial\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mspecial\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 42\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mlinalg\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 43\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mdistributions\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 44\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmstats_basic\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 45\u001b[0m from ._stats_mstats_common import (_find_repeats, linregress, theilslopes,\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\stats\\distributions.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;31m# instead of `git blame -Lxxx,+x`.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 7\u001b[0m \u001b[1;31m#\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 8\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_distn_infrastructure\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mrv_discrete\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrv_continuous\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrv_frozen\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 10\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_continuous_distns\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\stats\\_distn_infrastructure.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 22\u001b[0m \u001b[1;31m# for root finding for continuous distribution ppf, and max likelihood\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[1;31m# estimation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 24\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mscipy\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0moptimize\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 25\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 26\u001b[0m \u001b[1;31m# for functions of continuous distributions (e.g. moments, entropy, cdf)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\optimize\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 399\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 400\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0moptimize\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 401\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_minimize\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 402\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_root\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_root_scalar\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\optimize\\_minimize.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 23\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trustregion_dogleg\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_minimize_dogleg\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 24\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trustregion_ncg\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_minimize_trust_ncg\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 25\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trustregion_krylov\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_minimize_trust_krylov\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 26\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trustregion_exact\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_minimize_trustregion_exact\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trustregion_constr\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0m_minimize_trustregion_constr\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\optimize\\_trustregion_krylov.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trustregion\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0m_minimize_trust_region\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trlib\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mget_trlib_quadratic_subproblem\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[0m__all__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'_minimize_trust_krylov'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\scipy\\optimize\\_trlib\\__init__.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[1;33m.\u001b[0m\u001b[0m_trlib\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mTRLIBQuadraticSubproblem\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[0m__all__\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[1;34m'TRLIBQuadraticSubproblem'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'get_trlib_quadratic_subproblem'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", - "\u001b[1;32m~\\Anaconda3\\lib\\importlib\\_bootstrap.py\u001b[0m in \u001b[0;36mparent\u001b[1;34m(self)\u001b[0m\n", - "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Paola.YELA\\Anaconda3\\envs\\ops\\lib\\site-packages\\InstructorEmbedding\\instructor.py:7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", + " from tqdm.autonotebook import trange\n" ] } ], "source": [ - "from InstructorEmbedding import INSTRUCTOR" + "from InstructorEmbedding import INSTRUCTOR #https://pypi.org/project/InstructorEmbedding/" ] }, { "cell_type": "code", - "execution_count": null, - "id": "b5bbe202", + "execution_count": 25, + "id": "29b3150a", "metadata": {}, "outputs": [], + "source": [ + "from deep_parser import TextFromFile # pip install git+https://github.com/the-deep/deepex@newformat " + ] + }, + { + "cell_type": "markdown", + "id": "4ba882c9", + "metadata": {}, + "source": [ + "### 1. Model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b5bbe202", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "load INSTRUCTOR_Transformer\n", + "max_seq_length 512\n" + ] + } + ], "source": [ "emb_model = INSTRUCTOR('hkunlp/instructor-base') #also a large and xl version is available" ] }, { "cell_type": "code", - "execution_count": null, - "id": "30648927", + "execution_count": 8, + "id": "d3b2206c", + "metadata": {}, + "outputs": [], + "source": [ + "clf = pipeline(\"text-classification\", model=\"../model/model2-20230818/\", top_k=3)" + ] + }, + { + "cell_type": "markdown", + "id": "bba1a2b5", + "metadata": {}, + "source": [ + "### 2. Example" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d71bcb4f", "metadata": {}, "outputs": [], "source": [ - "txt = \"There are learnings in this life that you need to get right or wrong, but take\"" + "samples = requests.get('https://goadmin.ifrc.org/api/v2/appeal_document/?search=Emergency+Appeal+Final+Report').json()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d3b2206c", + "execution_count": 21, + "id": "9b16f53c", "metadata": {}, "outputs": [], "source": [ - "# let's lad also out trained classifier\n", - "clf = pipeline(\"text-classification\", model=\"./models/model2-20230818/\", top_k=3)" + "sample = samples['results'][0]['document_url']" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, + "id": "24991815", + "metadata": {}, + "outputs": [], + "source": [ + "req = requests.get(sample)\n", + "document = base64.b64encode(req.content)\n", + "base = TextFromFile(document)\n", + "text, _ = base.extract_text(output_format=\"list\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4ce8c396", + "metadata": {}, + "outputs": [], + "source": [ + "min_length = 20" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "c888d333", + "metadata": {}, + "outputs": [], + "source": [ + "txt = [line for page in text for line in page if len(line.split())>=min_length]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "78624545", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "127" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(txt)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "392e81ec", + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer_kwargs = {\n", + " 'padding': 'max_length',\n", + " 'truncation':True,\n", + " 'max_length': 256, \n", + " 'add_special_tokens': True, \n", + " 'return_token_type_ids':True\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "id": "ef3d7cc9", "metadata": {}, "outputs": [], "source": [ - "classification = clf_model(documents[i], **tokenizer_kwargs)" + "classification = clf(txt, **tokenizer_kwargs)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "19ba0569", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Lessons Learnt\n", + "Looking to the future, the Hungarian Red Cross plans to continue responding with migration related activities in the coming months. Currently it is not yet possible to indicate details of activities and locations as the Hungarian government has announced new regulations that will change the locations and the type of facilities. There is a tendency to move towards closed facilities and transit zones, where Hungarian Red Cross access is currently under discussion with the authorities.\n", + "============================\n", + "Lessons Learnt\n", + "Strengthening the skills of the National Society volunteers for the provision of the assistance to the above- mentioned through proper training;\n", + "============================\n", + "Lessons Learnt\n", + "Reinforcing the response capacities of the National Society in order to be prepared to react in case an increase in humanitarian needs takes place in the coming months, a scenario that considers the fluctuations in the migration\n", + "============================\n", + "Lessons Learnt\n", + "Outreach Red Cross post at the Serbian-Hungarian border: Despite the restricted access at the Serbian border areas, the Hungarian Red Cross county branch and national headquarters staff for the operation were able to carry out activities at the designated areas. This included basic PSS and First Aid activities.\n", + "============================\n", + "Lessons Learnt\n", + "A PSS-gender based violence training of trainers was conducted by the IFRC on 10-12 June 2016, where 3 Hungarian trainers took part, so they are able to deliver more training in the future for Hungarian Red Cross personal.\n", + "============================\n", + "Challenges\n", + "There was a lack of motivation and low interest in any community-based activities, which was more prevalent in Körmend where only single men were accommodated. Hungarian Red Cross also faced difficulties with the extremely short average length of stay in the reception centre at Vámosszabadi (usually two days).\n", + "============================\n", + "Lessons Learnt\n", + "Sexual and gender based violence should always be part of psychosocial support trainings in the future. Water, Sanitation and Hygiene Promotion\n", + "============================\n", + "Challenges\n", + "The main challenge was the lack of motivation and low interest in any community-based activities, including hygiene promotion trainings; in addition, the short average length of stay of the migrants as most were just transiting through\n", + "============================\n" + ] + } + ], + "source": [ + "for x in txt:\n", + " classification = clf(x, **tokenizer_kwargs)\n", + " if (classification[0]['score'] >= 0.99):\n", + " if (classification[0]['label']=='Lessons Learnt'):\n", + " print (classification[0]['label'])\n", + " print (x)\n", + " print(\"============================\")\n", + " elif (classification[0]['label']=='Challenges'):\n", + " print (classification[0]['label'])\n", + " print (x)\n", + " print(\"============================\")" ] } ], @@ -106,7 +273,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.8.12" } }, "nbformat": 4,