-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
279 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,3 @@ | ||
/model | ||
/data | ||
/data | ||
/notebooks/.ipynb_checkpoints |
279 changes: 277 additions & 2 deletions
279
notebooks/.ipynb_checkpoints/run classifier-checkpoint.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,281 @@ | ||
{ | ||
"cells": [], | ||
"metadata": {}, | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 14, | ||
"id": "bedcd040", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#in ops environment" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 27, | ||
"id": "e5301a37", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import requests\n", | ||
"import base64" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "f69e89a4", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from transformers import pipeline " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "d2cc776e", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"C:\\Users\\Paola.YELA\\Anaconda3\\envs\\ops\\lib\\site-packages\\InstructorEmbedding\\instructor.py:7: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", | ||
" from tqdm.autonotebook import trange\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from InstructorEmbedding import INSTRUCTOR #https://pypi.org/project/InstructorEmbedding/" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 25, | ||
"id": "29b3150a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from deep_parser import TextFromFile # pip install git+https://github.com/the-deep/deepex@newformat " | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "4ba882c9", | ||
"metadata": {}, | ||
"source": [ | ||
"### 1. Model" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "b5bbe202", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"load INSTRUCTOR_Transformer\n", | ||
"max_seq_length 512\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"emb_model = INSTRUCTOR('hkunlp/instructor-base') #also a large and xl version is available" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"id": "d3b2206c", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"clf = pipeline(\"text-classification\", model=\"../model/model2-20230818/\", top_k=3)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "bba1a2b5", | ||
"metadata": {}, | ||
"source": [ | ||
"### 2. Example" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 20, | ||
"id": "d71bcb4f", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"samples = requests.get('https://goadmin.ifrc.org/api/v2/appeal_document/?search=Emergency+Appeal+Final+Report').json()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 21, | ||
"id": "9b16f53c", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"sample = samples['results'][0]['document_url']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 28, | ||
"id": "24991815", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"req = requests.get(sample)\n", | ||
"document = base64.b64encode(req.content)\n", | ||
"base = TextFromFile(document)\n", | ||
"text, _ = base.extract_text(output_format=\"list\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 33, | ||
"id": "4ce8c396", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"min_length = 20" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 34, | ||
"id": "c888d333", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"txt = [line for page in text for line in page if len(line.split())>=min_length]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 37, | ||
"id": "78624545", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"127" | ||
] | ||
}, | ||
"execution_count": 37, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(txt)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"id": "392e81ec", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"tokenizer_kwargs = {\n", | ||
" 'padding': 'max_length',\n", | ||
" 'truncation':True,\n", | ||
" 'max_length': 256, \n", | ||
" 'add_special_tokens': True, \n", | ||
" 'return_token_type_ids':True\n", | ||
" }" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"id": "ef3d7cc9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"classification = clf(txt, **tokenizer_kwargs)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 46, | ||
"id": "19ba0569", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Lessons Learnt\n", | ||
"Looking to the future, the Hungarian Red Cross plans to continue responding with migration related activities in the coming months. Currently it is not yet possible to indicate details of activities and locations as the Hungarian government has announced new regulations that will change the locations and the type of facilities. There is a tendency to move towards closed facilities and transit zones, where Hungarian Red Cross access is currently under discussion with the authorities.\n", | ||
"============================\n", | ||
"Lessons Learnt\n", | ||
"Strengthening the skills of the National Society volunteers for the provision of the assistance to the above- mentioned through proper training;\n", | ||
"============================\n", | ||
"Lessons Learnt\n", | ||
"Reinforcing the response capacities of the National Society in order to be prepared to react in case an increase in humanitarian needs takes place in the coming months, a scenario that considers the fluctuations in the migration\n", | ||
"============================\n", | ||
"Lessons Learnt\n", | ||
"Outreach Red Cross post at the Serbian-Hungarian border: Despite the restricted access at the Serbian border areas, the Hungarian Red Cross county branch and national headquarters staff for the operation were able to carry out activities at the designated areas. This included basic PSS and First Aid activities.\n", | ||
"============================\n", | ||
"Lessons Learnt\n", | ||
"A PSS-gender based violence training of trainers was conducted by the IFRC on 10-12 June 2016, where 3 Hungarian trainers took part, so they are able to deliver more training in the future for Hungarian Red Cross personal.\n", | ||
"============================\n", | ||
"Challenges\n", | ||
"There was a lack of motivation and low interest in any community-based activities, which was more prevalent in Körmend where only single men were accommodated. Hungarian Red Cross also faced difficulties with the extremely short average length of stay in the reception centre at Vámosszabadi (usually two days).\n", | ||
"============================\n", | ||
"Lessons Learnt\n", | ||
"Sexual and gender based violence should always be part of psychosocial support trainings in the future. Water, Sanitation and Hygiene Promotion\n", | ||
"============================\n", | ||
"Challenges\n", | ||
"The main challenge was the lack of motivation and low interest in any community-based activities, including hygiene promotion trainings; in addition, the short average length of stay of the migrants as most were just transiting through\n", | ||
"============================\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for x in txt:\n", | ||
" classification = clf(x, **tokenizer_kwargs)\n", | ||
" if (classification[0]['score'] >= 0.99):\n", | ||
" if (classification[0]['label']=='Lessons Learnt'):\n", | ||
" print (classification[0]['label'])\n", | ||
" print (x)\n", | ||
" print(\"============================\")\n", | ||
" elif (classification[0]['label']=='Challenges'):\n", | ||
" print (classification[0]['label'])\n", | ||
" print (x)\n", | ||
" print(\"============================\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.8.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |