diff --git a/notebooks/.ipynb_checkpoints/run classifier-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/run classifier-checkpoint.ipynb deleted file mode 100644 index a01c51e..0000000 --- a/notebooks/.ipynb_checkpoints/run classifier-checkpoint.ipynb +++ /dev/null @@ -1,281 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 14, - "id": "bedcd040", - "metadata": {}, - "outputs": [], - "source": [ - "#in ops environment" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "e5301a37", - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "import base64" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "f69e89a4", - "metadata": {}, - "outputs": [], - "source": [ - "from transformers import pipeline " - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "d2cc776e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\Paola.YELA\\Anaconda3\\envs\\ops\\lib\\site-packages\\InstructorEmbedding\\instructor.py:7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", - " from tqdm.autonotebook import trange\n" - ] - } - ], - "source": [ - "from InstructorEmbedding import INSTRUCTOR #https://pypi.org/project/InstructorEmbedding/" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "29b3150a", - "metadata": {}, - "outputs": [], - "source": [ - "from deep_parser import TextFromFile # pip install git+https://github.com/the-deep/deepex@newformat " - ] - }, - { - "cell_type": "markdown", - "id": "4ba882c9", - "metadata": {}, - "source": [ - "### 1. Model" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b5bbe202", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "load INSTRUCTOR_Transformer\n", - "max_seq_length 512\n" - ] - } - ], - "source": [ - "emb_model = INSTRUCTOR('hkunlp/instructor-base') #also a large and xl version is available" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d3b2206c", - "metadata": {}, - "outputs": [], - "source": [ - "clf = pipeline(\"text-classification\", model=\"../model/model2-20230818/\", top_k=3)" - ] - }, - { - "cell_type": "markdown", - "id": "bba1a2b5", - "metadata": {}, - "source": [ - "### 2. Example" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "d71bcb4f", - "metadata": {}, - "outputs": [], - "source": [ - "samples = requests.get('https://goadmin.ifrc.org/api/v2/appeal_document/?search=Emergency+Appeal+Final+Report').json()" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "9b16f53c", - "metadata": {}, - "outputs": [], - "source": [ - "sample = samples['results'][0]['document_url']" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "24991815", - "metadata": {}, - "outputs": [], - "source": [ - "req = requests.get(sample)\n", - "document = base64.b64encode(req.content)\n", - "base = TextFromFile(document)\n", - "text, _ = base.extract_text(output_format=\"list\")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "4ce8c396", - "metadata": {}, - "outputs": [], - "source": [ - "min_length = 20" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "c888d333", - "metadata": {}, - "outputs": [], - "source": [ - "txt = [line for page in text for line in page if len(line.split())>=min_length]" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "78624545", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "127" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(txt)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "392e81ec", - "metadata": {}, - "outputs": [], - "source": [ - "tokenizer_kwargs = {\n", - " 'padding': 'max_length',\n", - " 'truncation':True,\n", - " 'max_length': 256, \n", - " 'add_special_tokens': True, \n", - " 'return_token_type_ids':True\n", - " }" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "ef3d7cc9", - "metadata": {}, - "outputs": [], - "source": [ - "classification = clf(txt, **tokenizer_kwargs)" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "19ba0569", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Lessons Learnt\n", - "Looking to the future, the Hungarian Red Cross plans to continue responding with migration related activities in the coming months. Currently it is not yet possible to indicate details of activities and locations as the Hungarian government has announced new regulations that will change the locations and the type of facilities. There is a tendency to move towards closed facilities and transit zones, where Hungarian Red Cross access is currently under discussion with the authorities.\n", - "============================\n", - "Lessons Learnt\n", - "Strengthening the skills of the National Society volunteers for the provision of the assistance to the above- mentioned through proper training;\n", - "============================\n", - "Lessons Learnt\n", - "Reinforcing the response capacities of the National Society in order to be prepared to react in case an increase in humanitarian needs takes place in the coming months, a scenario that considers the fluctuations in the migration\n", - "============================\n", - "Lessons Learnt\n", - "Outreach Red Cross post at the Serbian-Hungarian border: Despite the restricted access at the Serbian border areas, the Hungarian Red Cross county branch and national headquarters staff for the operation were able to carry out activities at the designated areas. This included basic PSS and First Aid activities.\n", - "============================\n", - "Lessons Learnt\n", - "A PSS-gender based violence training of trainers was conducted by the IFRC on 10-12 June 2016, where 3 Hungarian trainers took part, so they are able to deliver more training in the future for Hungarian Red Cross personal.\n", - "============================\n", - "Challenges\n", - "There was a lack of motivation and low interest in any community-based activities, which was more prevalent in Körmend where only single men were accommodated. Hungarian Red Cross also faced difficulties with the extremely short average length of stay in the reception centre at Vámosszabadi (usually two days).\n", - "============================\n", - "Lessons Learnt\n", - "Sexual and gender based violence should always be part of psychosocial support trainings in the future. Water, Sanitation and Hygiene Promotion\n", - "============================\n", - "Challenges\n", - "The main challenge was the lack of motivation and low interest in any community-based activities, including hygiene promotion trainings; in addition, the short average length of stay of the migrants as most were just transiting through\n", - "============================\n" - ] - } - ], - "source": [ - "for x in txt:\n", - " classification = clf(x, **tokenizer_kwargs)\n", - " if (classification[0]['score'] >= 0.99):\n", - " if (classification[0]['label']=='Lessons Learnt'):\n", - " print (classification[0]['label'])\n", - " print (x)\n", - " print(\"============================\")\n", - " elif (classification[0]['label']=='Challenges'):\n", - " print (classification[0]['label'])\n", - " print (x)\n", - " print(\"============================\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}