diff --git a/featurize_news.ipynb b/featurize_news.ipynb
index c6e5c66..71079a3 100644
--- a/featurize_news.ipynb
+++ b/featurize_news.ipynb
@@ -1,508 +1,1318 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "provenance": [],
- "mount_file_id": "1m9v35TGO0OOMzlT4exu7ahfYXXRampiz",
- "authorship_tag": "ABX9TyPyYfgejErg0T3/L14UPodC",
- "include_colab_link": true
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ ""
+ ]
},
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "aa44fa70cbf043ea9109a44c1348cb2b": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "HBoxModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_755207a276dc423e982fd2433783aca9",
- "IPY_MODEL_34a1c3fafe3840f8aef86fc0ce16d9d4",
- "IPY_MODEL_09d934d3d6664434880da3f52cb52efc"
- ],
- "layout": "IPY_MODEL_d849f9c8e894473eb054103b6c0965cc"
- }
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 351,
+ "referenced_widgets": [
+ "67945d3f3b26409a98f6a4e91dd77380",
+ "fe0fdd93415c4970b6428e0b66c30681",
+ "69318b3a6a6749dbbbbdb87b506e94cb",
+ "ddacb928a856483481546b838556ac42",
+ "4377321d9ca04dd4ab10fe6ff82cf160",
+ "cef9e8b68be64849bcc259f7b5dc5e1f",
+ "d9da04e84c9144b6aabd6e5f23d1614a",
+ "1ee8c124f9494caf886773be63a2bfd7",
+ "2bcf94618d054443b3671bc38881af6d",
+ "120086e5de314c8b84dfa6d5bcaac2bc",
+ "b14c3547ac08454db2d01b25aa7dfddf",
+ "33f1dc7b277d4ec5b07aad03cd914c52",
+ "9038781a628241e191b7ed46cb3a6650",
+ "6f6a1228068b40e3b2417bd8c4b885cc",
+ "33e4be715486454389f2607defbd4550",
+ "4a8724d7531b494db9db7e66b2535244",
+ "62d0aabee46f40ea97ffae6f2bce5e40",
+ "63ea2925b4c34ae28af663b19e2ddf67",
+ "ccf0ea6bd9e140a6a24ac62efe0bdd67",
+ "7ececbbebe5f48fb81d77a8b91d48064",
+ "964d5ad0e62f43fda6bd9dae591d397c",
+ "76b2e19c188c4612afcb34bdc0ff58a2",
+ "f9fa031e785f4335a3fc4a36d611e207",
+ "a739a10becc8457b8179a669b4e72bf2",
+ "f17c69db88f646ca9c2bf1cc354fc907",
+ "94af3b28929a481abf532b49c3990691",
+ "ed26469a161c42df9f6c1070b84be68b",
+ "613313e837544f1fac967279c5746c9f",
+ "67488582ce1940cfbf07c577758f0b1d",
+ "605d95933565413886ff6508b7d20453",
+ "d3981ae1e68d4ad0abd704ea785df995",
+ "877edc5bb3894dc3935edfeaaf726a8f",
+ "c2123ea20a684903b82dc376a409c682",
+ "c58ce8753ead4874b0d81bab8f70a7bd",
+ "7cabc65f15ef4aedb3a7ce30fb4aac8d",
+ "96e4be4c35cc45feac1461ae5271bdc4",
+ "63e14cc87e1946a8876a4a9f2d1281e8",
+ "dd844b2e73f74885a1f9a336fbffa0e7",
+ "f3b91c2405d4476bb05d23ffec46c3c0",
+ "ec83790b87be401286f894faf430a8c4",
+ "f029693037ed49049bcd87b3d9683b2d",
+ "2623c225378c495485e0f63d38cc7925",
+ "bd79b8f8fdbb487f9142bf2090729f1b",
+ "eb545ae5ddf8409883198d2e5bab0745",
+ "30725e31e2c84e40bfc1b36f85a52c99",
+ "edb2ec97694341d19092a23c8be4e0bf",
+ "7073179f100644718ee8c660da9a1b2d",
+ "b59153f234b84236bf9fad4100c30b4c",
+ "7a761207d5e34d55b955587d247458bf",
+ "b0446147f04946e490e51e1f353fcd51",
+ "1594def1db68427b92d67948fb2c0b61",
+ "a5497d52e9b2414fb5624bd5a6caa51a",
+ "c4dce35597ce42b58b3734d5f9e11b59",
+ "9f69be914ec84dbdabb5766cb4148cba",
+ "7f1cc79f9f9847c58d92c055213f4588",
+ "64f3f713a80e4d67b7cdb012c7a1b73d"
+ ]
},
- "755207a276dc423e982fd2433783aca9": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "HTMLModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_0e51337eb87e430b8e477488361c52e4",
- "placeholder": "",
- "style": "IPY_MODEL_977f5053e46a432fb4d446ce55b13ac1",
- "value": "config.json: 100%"
- }
+ "id": "XNPa1N_VETtt",
+ "outputId": "d681349b-2436-4e85-d28c-3ca6ca3ef946"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n",
+ "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
+ "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
+ "You will be able to reuse this secret in all of your notebooks.\n",
+ "Please note that authentication is recommended but still optional to access public models or datasets.\n",
+ " warnings.warn(\n"
+ ]
},
- "34a1c3fafe3840f8aef86fc0ce16d9d4": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "FloatProgressModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_91b626119aab4cdab59a02609a8caa49",
- "max": 480,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_c3fe50a7e98347fe8fb51c7692c87d4f",
- "value": 480
- }
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "67945d3f3b26409a98f6a4e91dd77380",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "config.json: 0%| | 0.00/480 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
- "09d934d3d6664434880da3f52cb52efc": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "HTMLModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_00eee16604c6408483e154b4df8a7595",
- "placeholder": "",
- "style": "IPY_MODEL_9e5d9dc852074d589a2692db4222dbd7",
- "value": " 480/480 [00:00<00:00, 17.5kB/s]"
- }
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fe0fdd93415c4970b6428e0b66c30681",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "model.safetensors: 0%| | 0.00/331M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
- "d849f9c8e894473eb054103b6c0965cc": {
- "model_module": "@jupyter-widgets/base",
- "model_name": "LayoutModel",
- "model_module_version": "1.2.0",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "9038781a628241e191b7ed46cb3a6650",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer_config.json: 0%| | 0.00/25.0 [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
- "0e51337eb87e430b8e477488361c52e4": {
- "model_module": "@jupyter-widgets/base",
- "model_name": "LayoutModel",
- "model_module_version": "1.2.0",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "a739a10becc8457b8179a669b4e72bf2",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "vocab.json: 0%| | 0.00/899k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
- "977f5053e46a432fb4d446ce55b13ac1": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "DescriptionStyleModel",
- "model_module_version": "1.5.0",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
- },
- "91b626119aab4cdab59a02609a8caa49": {
- "model_module": "@jupyter-widgets/base",
- "model_name": "LayoutModel",
- "model_module_version": "1.2.0",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "c3fe50a7e98347fe8fb51c7692c87d4f": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "ProgressStyleModel",
- "model_module_version": "1.5.0",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "ProgressStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "bar_color": null,
- "description_width": ""
- }
- },
- "00eee16604c6408483e154b4df8a7595": {
- "model_module": "@jupyter-widgets/base",
- "model_name": "LayoutModel",
- "model_module_version": "1.2.0",
- "state": {
- "_model_module": "@jupyter-widgets/base",
- "_model_module_version": "1.2.0",
- "_model_name": "LayoutModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "LayoutView",
- "align_content": null,
- "align_items": null,
- "align_self": null,
- "border": null,
- "bottom": null,
- "display": null,
- "flex": null,
- "flex_flow": null,
- "grid_area": null,
- "grid_auto_columns": null,
- "grid_auto_flow": null,
- "grid_auto_rows": null,
- "grid_column": null,
- "grid_gap": null,
- "grid_row": null,
- "grid_template_areas": null,
- "grid_template_columns": null,
- "grid_template_rows": null,
- "height": null,
- "justify_content": null,
- "justify_items": null,
- "left": null,
- "margin": null,
- "max_height": null,
- "max_width": null,
- "min_height": null,
- "min_width": null,
- "object_fit": null,
- "object_position": null,
- "order": null,
- "overflow": null,
- "overflow_x": null,
- "overflow_y": null,
- "padding": null,
- "right": null,
- "top": null,
- "visibility": null,
- "width": null
- }
- },
- "9e5d9dc852074d589a2692db4222dbd7": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "DescriptionStyleModel",
- "model_module_version": "1.5.0",
- "state": {
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "DescriptionStyleModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/base",
- "_view_module_version": "1.2.0",
- "_view_name": "StyleView",
- "description_width": ""
- }
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7cabc65f15ef4aedb3a7ce30fb4aac8d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "merges.txt: 0%| | 0.00/456k [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
- "94928fd867d94a228be7035b5632b8ab": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "HBoxModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HBoxModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HBoxView",
- "box_style": "",
- "children": [
- "IPY_MODEL_62ec1379185246b89888f0f85966de71",
- "IPY_MODEL_f88535ec5fd74d849bb4e508a92ec281",
- "IPY_MODEL_5a79919899c04baea70463a684eb3459"
- ],
- "layout": "IPY_MODEL_96b1e718addb4128944636e5c55bd67f"
- }
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "edb2ec97694341d19092a23c8be4e0bf",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer.json: 0%| | 0.00/1.36M [00:00, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
},
- "62ec1379185246b89888f0f85966de71": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "HTMLModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_2451525358b046faa4dd7755c3ca7d3b",
- "placeholder": "",
- "style": "IPY_MODEL_832bd5ed2a884f749d5813e817a66cfe",
- "value": "model.safetensors: 100%"
- }
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
+ "[nltk_data] Unzipping tokenizers/punkt.zip.\n"
+ ]
},
- "f88535ec5fd74d849bb4e508a92ec281": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "FloatProgressModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "FloatProgressModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "ProgressView",
- "bar_style": "success",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_1763d95c639e4cf2b1ec9d9a0126162a",
- "max": 331055963,
- "min": 0,
- "orientation": "horizontal",
- "style": "IPY_MODEL_5bb4ae3ced224927a201dbd623de447c",
- "value": 331055963
- }
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "## adapted from https://jaketae.github.io/study/keyword-extraction/#candidate-selection\n",
+ "# All imports come first so the notebook survives Restart & Run All.\n",
+ "import string\n",
+ "import xml.etree.ElementTree as ET  # XML parsing in get_npr_stories\n",
+ "\n",
+ "import nltk\n",
+ "import numpy as np\n",
+ "import requests  # HTTP calls in get_npr_stories\n",
+ "import spacy\n",
+ "import torch\n",
+ "from bs4 import BeautifulSoup  # HTML parsing in get_npr_stories\n",
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
+ "from transformers import AutoModel, AutoTokenizer\n",
+ "\n",
+ "# Embedding model and tokenizer used by featurize_stories.\n",
+ "model_name = \"distilroberta-base\"\n",
+ "model = AutoModel.from_pretrained(model_name)\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+ "\n",
+ "# spaCy pipeline used for noun / noun-chunk detection.\n",
+ "nlp = spacy.load('en_core_web_sm')\n",
+ "\n",
+ "# CountVectorizer settings for candidate extraction.\n",
+ "n_gram_range = (1, 2)\n",
+ "stop_words = \"english\"\n",
+ "embeddings=[]\n",
+ "\n",
+ "# Run the model on the GPU when one is available.\n",
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+ "model.to(device)\n",
+ "\n",
+ "# Tokenizer data required by nltk.word_tokenize in chunk_text.\n",
+ "nltk.download('punkt')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
},
- "5a79919899c04baea70463a684eb3459": {
- "model_module": "@jupyter-widgets/controls",
- "model_name": "HTMLModel",
- "model_module_version": "1.5.0",
- "state": {
- "_dom_classes": [],
- "_model_module": "@jupyter-widgets/controls",
- "_model_module_version": "1.5.0",
- "_model_name": "HTMLModel",
- "_view_count": null,
- "_view_module": "@jupyter-widgets/controls",
- "_view_module_version": "1.5.0",
- "_view_name": "HTMLView",
- "description": "",
- "description_tooltip": null,
- "layout": "IPY_MODEL_03e314ef83374ef8a2c3c8ce85768195",
- "placeholder": "",
- "style": "IPY_MODEL_9ff5ea43dcd84849884a0e2b4e1e861c",
- "value": " 331M/331M [00:02<00:00, 169MB/s]"
- }
+ "id": "Y4miUEh-WD7G",
+ "outputId": "f3a35370-dfbe-43b0-e6e0-d4c609ee3ce9"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "device(type='cpu')"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "device"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "RJjNm4qvEZoD"
+ },
+ "outputs": [],
+ "source": [
+ "def get_npr_stories(p, api_key=None):\n",
+ "    \"\"\"Download the full text of the NPR stories matched by query ``p``.\n",
+ "\n",
+ "    Args:\n",
+ "        p: query parameters passed to the NPR ``/query`` endpoint via ``requests``.\n",
+ "        api_key: NPR API key. Defaults to the ``NPR_API_KEY`` environment variable\n",
+ "            so that no credential is stored in the notebook.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list with one string per story: the text of every <p> element on the\n",
+ "        story page, joined with single spaces.\n",
+ "\n",
+ "    Raises:\n",
+ "        ValueError: if no API key is supplied or found in the environment.\n",
+ "    \"\"\"\n",
+ "    import os  # local import: only this helper needs it\n",
+ "\n",
+ "    api_key = api_key or os.environ.get('NPR_API_KEY')\n",
+ "    if not api_key:\n",
+ "        raise ValueError('NPR API key missing: pass api_key or set NPR_API_KEY')\n",
+ "\n",
+ "    # Send a GET request to the NPR API (the timeout keeps a dead connection from hanging the cell)\n",
+ "    r = requests.get(f'http://api.npr.org/query?apiKey={api_key}', params=p, timeout=30)\n",
+ "\n",
+ "    # Parse the XML response to get the story URLs; stories without a link are skipped\n",
+ "    root = ET.fromstring(r.content)\n",
+ "    links = (story.find('link') for story in root.iter('story'))\n",
+ "    story_urls = [link.text for link in links if link is not None and link.text]\n",
+ "\n",
+ "    # For each story URL, fetch the page and keep the text of its <p> tags.\n",
+ "    # Adjust the selector if the page structure changes.\n",
+ "    full_stories = []\n",
+ "    for url in story_urls:\n",
+ "        response = requests.get(url, timeout=30)\n",
+ "        soup = BeautifulSoup(response.text, 'html.parser')\n",
+ "        paragraphs = soup.find_all('p')\n",
+ "        full_stories.append(' '.join(para.text for para in paragraphs))\n",
+ "    return full_stories\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "EnFhQIkrEZs5"
+ },
+ "outputs": [],
+ "source": [
+ "def chunk_text(text, max_len):\n",
+ "    \"\"\"Split ``text`` into lists of NLTK word tokens, each at most ``max_len`` long.\n",
+ "\n",
+ "    When the tokens left over after filling whole ``max_len`` chunks number fewer\n",
+ "    than ``max_len / 2``, all tokens are spread evenly over one extra chunk\n",
+ "    instead of leaving a short tail; otherwise fixed ``max_len`` slices are used.\n",
+ "\n",
+ "    Returns:\n",
+ "        list[list[str]]: consecutive token lists covering the whole text.\n",
+ "    \"\"\"\n",
+ "    tokens = nltk.word_tokenize(text)\n",
+ "    full_chunks, remainder = divmod(len(tokens), max_len)\n",
+ "\n",
+ "    if remainder >= max_len / 2:\n",
+ "        # The tail is big enough to stand alone: plain fixed-size slices.\n",
+ "        return [tokens[i:i + max_len] for i in range(0, len(tokens), max_len)]\n",
+ "\n",
+ "    # Balance the tokens over one more chunk than the number of full chunks.\n",
+ "    num_chunks = full_chunks + 1\n",
+ "    base, extra = divmod(len(tokens), num_chunks)\n",
+ "    chunks, start = [], 0\n",
+ "    for i in range(num_chunks):\n",
+ "        end = start + base + (1 if i < extra else 0)\n",
+ "        chunks.append(tokens[start:end])\n",
+ "        start = end\n",
+ "    return chunks\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "RR_2MZ5nEZvO"
+ },
+ "outputs": [],
+ "source": [
+ "from torch.utils.data import DataLoader\n",
+ "\n",
+ "def featurize_stories(text, max_len, top_k):\n",
+ "    \"\"\"Return the ``top_k`` noun candidates whose embeddings best match ``text``.\n",
+ "\n",
+ "    Candidates are the n-gram vocabulary of ``text`` (``n_gram_range``,\n",
+ "    ``stop_words``) restricted to spaCy nouns and noun chunks. The text is split\n",
+ "    with ``chunk_text``, each chunk is embedded as one sequence, and the chunk\n",
+ "    embeddings are averaged into a single document embedding.\n",
+ "\n",
+ "    Args:\n",
+ "        text: document to extract keywords from.\n",
+ "        max_len: maximum number of word tokens per chunk.\n",
+ "        top_k: number of keywords to return.\n",
+ "\n",
+ "    Returns:\n",
+ "        Up to ``top_k`` candidate strings, most similar first.\n",
+ "\n",
+ "    Raises:\n",
+ "        IndexError: if ``text`` yields no noun candidates.\n",
+ "    \"\"\"\n",
+ "    # Extract candidate words/phrases\n",
+ "    count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])\n",
+ "    all_candidates = count.get_feature_names_out()\n",
+ "    doc = nlp(text)\n",
+ "    noun_phrases = {chunk.text.strip().lower() for chunk in doc.noun_chunks}\n",
+ "    nouns = {token.text for token in doc if token.pos_ == \"NOUN\"}\n",
+ "    all_nouns = nouns | noun_phrases\n",
+ "    candidates = [candidate for candidate in all_candidates if candidate in all_nouns]\n",
+ "    if not candidates:\n",
+ "        # Callers already catch IndexError to skip texts they cannot featurize.\n",
+ "        raise IndexError('no noun candidates found in text')\n",
+ "\n",
+ "    # Per-call list: a shared module-level list would mix embeddings of earlier\n",
+ "    # texts into later ones and grow without bound.\n",
+ "    chunk_embeddings = []\n",
+ "    with torch.no_grad():  # inference only, no autograd graph needed\n",
+ "        candidate_tokens = tokenizer(candidates, padding=True, return_tensors=\"pt\")\n",
+ "        candidate_tokens = {k: v.to(device) for k, v in candidate_tokens.items()}\n",
+ "        candidate_embeddings = model(**candidate_tokens)[\"pooler_output\"].cpu()\n",
+ "\n",
+ "        for chunk in chunk_text(text, max_len):\n",
+ "            # chunk is a list of word tokens; join it so the model embeds one\n",
+ "            # sequence rather than a batch of single words. Word count does not\n",
+ "            # bound sub-word count, hence truncation.\n",
+ "            text_tokens = tokenizer(\" \".join(chunk), truncation=True, return_tensors=\"pt\")\n",
+ "            text_tokens = {k: v.to(device) for k, v in text_tokens.items()}\n",
+ "            chunk_embeddings.append(model(**text_tokens)[\"pooler_output\"].cpu())\n",
+ "\n",
+ "    # One (1, hidden) document vector: the mean over chunk embeddings.\n",
+ "    avg_embedding = torch.cat(chunk_embeddings).mean(dim=0, keepdim=True)\n",
+ "    distances = cosine_similarity(avg_embedding.numpy(), candidate_embeddings.numpy())\n",
+ "    torch.cuda.empty_cache()\n",
+ "    # argsort is ascending: reverse it and keep the first top_k (highest similarity).\n",
+ "    ranked = distances.argsort()[0][::-1]\n",
+ "    return [candidates[index] for index in ranked[:top_k]]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3mlI9cOaEsnS"
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "# data=pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet.csv')\n",
+ "data=pd.read_csv('/content/drive/MyDrive/consult/Louie_california_weather.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VaqzAonQGa11",
+ "outputId": "52055c96-4aa9-43cc-84ae-426a86df31ab"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 California Snow Storm Latest as Drivers Trappe...\n",
+ "1 Large, illegal marijuana grow posed ‘extreme f...\n",
+ "2 @nytimes The classic California extreme weathe...\n",
+ "3 extreme weather https://t.co/hp9yOVBkMR \n",
+ "4 Extreme weather https://t.co/T5QtBjx3z4 \n",
+ " ... \n",
+ "9995 @cryptojack Conquer the web3 platform with $BL...\n",
+ "9996 it’s a hockey game not desert storm\n",
+ "9997 @NHLFlames But we are having fun and we have a...\n",
+ "9998 @BlastRunnersPVP 3 days past after last announ...\n",
+ "9999 @Balkaur07527817 Ok then?\n",
+ "Name: text, Length: 10000, dtype: object"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data['text']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "e5VsbTP7WaSH"
+ },
+ "outputs": [],
+ "source": [
+ "# df = pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features.txt',sep='\\t')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "FjoOGK0LCj32"
+ },
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "colab": {
+ "background_save": true,
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ta3pWa5BEZx2",
+ "outputId": "0ea02337-2c56-4903-e553-1462080bc098"
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ " 2%|▏ | 161/10000 [02:28<2:07:04, 1.29it/s]"
+ ]
+ }
+ ],
+ "source": [
+ "from tqdm import tqdm\n",
+ "\n",
+ "rank_articles = []\n",
+ "skipped = 0\n",
+ "# Iterate over every text directly. A DataLoader's len() is the number of\n",
+ "# batches, so looping over range(len(dataloader)) with batch_size=32 only\n",
+ "# reached the first ~1/32 of the rows.\n",
+ "for tweet in tqdm(data['text']):\n",
+ "    try:\n",
+ "        rank_articles.append(featurize_stories(tweet, max_len=512, top_k=4))\n",
+ "    except IndexError:\n",
+ "        # Texts that cannot be featurized are still skipped, but counted.\n",
+ "        skipped += 1\n",
+ "print(f'skipped {skipped} of {len(data)} texts')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Q3pN0rHtfWon"
+ },
+ "outputs": [],
+ "source": [
+ "flattened_list = [item for sublist in rank_articles for item in sublist]\n",
+ "from collections import Counter\n",
+ "counter = Counter(flattened_list)\n",
+ "df = pd.DataFrame.from_dict(counter, orient='index', columns=['Count'])\n",
+ "\n",
+ "df = df.sort_values(by='Count',ascending=False)\n",
+ "# df.to_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features2.txt',sep='\\t')\n",
+ "df.to_csv('/content/drive/MyDrive/consult/california_weather_tweet_features.txt',sep='\\t')\n",
+ "\n",
+ "print(len(df))\n",
+ "# df[:25]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "perIfMb9CUcm"
+ },
+ "outputs": [],
+ "source": [
+ "len(rank_articles)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rmd7fnVDebrq"
+ },
+ "outputs": [],
+ "source": [
+ "# df = pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features.txt',sep='\\t')\n",
+ "# df2=pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features2.txt',sep='\\t')\n",
+ "\n",
+ "# df=pd.concat([df,df2])\n",
+ "# print(df['Unnamed: 0'])\n",
+ "# df = df.groupby('Unnamed: 0').sum().sort_values(by='Count',ascending=False)\n",
+ "# df=df[df['Count']>int(np.round(len(df)*.001))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yLeHkpHP_2aj"
+ },
+ "outputs": [],
+ "source": [
+ "import spacy\n",
+ "\n",
+ "nlp = spacy.load('en_core_web_sm')\n",
+ "\n",
+ "# nouns = ['apple', 'John', 'London', 'dog', 'Mary', 'Paris', 'banana']\n",
+ "# The keywords are the index of df; an 'Unnamed: 0' column only exists after a\n",
+ "# CSV round trip, so reading the index works for both the in-memory and the\n",
+ "# reloaded/grouped frame.\n",
+ "nouns = df.index.to_list()\n",
+ "doc = nlp(' '.join(nouns))\n",
+ "\n",
+ "proper_nouns = [token.text for token in doc if token.pos_ == 'PROPN']\n",
+ "\n",
+ "print(proper_nouns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "YGiVzRMvFX3X"
+ },
+ "outputs": [],
+ "source": [
+
"print(len(proper_nouns))\n", + "proper_nouns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IDJUwNpyFYSl" + }, + "outputs": [], + "source": [ + "len(nouns)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# run from module" + ], + "metadata": { + "id": "6W7czIuqXnGv" + } + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "Nxgc3toYFllR", + "colab": { + "base_uri": "https://localhost:8080/" }, - "96b1e718addb4128944636e5c55bd67f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null + "outputId": "7bdb39f7-e67f-46a3-ac1b-a1f3a7a14619" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m51.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for mlx-grph (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "CPU times: user 563 ms, sys: 68.6 ms, total: 631 ms\n", + "Wall time: 1min 21s\n" + ] + } + ], + "source": [ + "# %%time\n", + "# !pip install --quiet git+https://github.com/dcolinmorgan/mlx_grph.git" + ] + }, + { + "cell_type": "code", + "source": [ + "import sys,os,argparse,csv\n", + "sys.argv = ['DT_feat.py', '-n', '100', '-f', '3', '-o', 'OS_feats.csv', '-s', '1']\n", + "parser = argparse.ArgumentParser(description='Process OS data for dynamic features.')\n", + "parser.add_argument('-n', type=int, default=10, help='Number of data items to get')\n", + "parser.add_argument('-f', type=int, default=3, help='Number of features per item to get')\n", + "parser.add_argument('-o', type=str, default='OS_feats.csv', help='Output file name')\n", + "parser.add_argument('-s', type=int, default=1, help='Parallelize requests')\n", + "args, unknown = parser.parse_known_args()\n", + "\n", + "from mlx_grph.DT_feat import featurize_stories, process_data, get_data, process_hit" + ], + "metadata": { + "id": "5-qGJ8Ba8soe" + }, + "execution_count": 26, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import userdata\n", + "ost=userdata.get('OS_TOKEN')\n", + 
"text_file = open(\"/usr/local/lib/python3.10/dist-packages/mlx_grph/.env\", \"w\")\n", + "text_file.write('OS_TOKEN='+ost)\n", + "text_file.close()" + ], + "metadata": { + "id": "EOshLVZf93fW" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import logging\n", + "from tqdm import tqdm\n", + "logging.basicConfig(level=logging.INFO)" + ], + "metadata": { + "id": "odrq4fpMa726" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "data = get_data(100)\n", + "articles = process_data(data)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sJ-RIyki7A3r", + "outputId": "232d0318-7c10-4803-ea46-bbccc3d10ba5" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "grabbing text from url: 100%|██████████| 100/100 [01:22<00:00, 1.22it/s]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "rank_articles=[]\n", + "for i in tqdm(articles):\n", + " parts=str(i).split('[', 3)\n", + " try:\n", + " cc=featurize_stories(str(i), top_k = args.f, max_len=512)\n", + " cc.append(parts[1])\n", + " rank_articles.append(cc)\n", + " except Exception as e:\n", + " logging.error(f\"Failed to process article: {e}\")\n", + "with open(args.o, 'w', newline='') as file:\n", + " writer = csv.writer(file)\n", + " writer.writerows(rank_articles)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mXdybPuG3xSB", + "outputId": "318f3163-0f94-44ec-9685-b1c0eefc5661" + }, + "execution_count": 42, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + " 0%| | 0/98 [00:00, ?it/s]ERROR:root:Failed to process article: list index out of range\n", + " 15%|█▌ | 15/98 [00:25<02:11, 1.58s/it]ERROR:root:Failed to process article: list index out of range\n", + " 20%|██ | 20/98 [00:31<01:40, 1.29s/it]ERROR:root:Failed to process article: list index out of range\n", + " 
36%|███▌ | 35/98 [00:53<01:35, 1.52s/it]ERROR:root:Failed to process article: list index out of range\n", + "ERROR:root:Failed to process article: list index out of range\n", + " 39%|███▉ | 38/98 [00:56<01:09, 1.15s/it]ERROR:root:Failed to process article: list index out of range\n", + " 43%|████▎ | 42/98 [01:01<01:15, 1.35s/it]ERROR:root:Failed to process article: list index out of range\n", + " 48%|████▊ | 47/98 [01:07<01:06, 1.30s/it]ERROR:root:Failed to process article: list index out of range\n", + " 56%|█████▌ | 55/98 [01:18<01:02, 1.45s/it]ERROR:root:Failed to process article: list index out of range\n", + " 60%|██████ | 59/98 [01:23<00:58, 1.50s/it]ERROR:root:Failed to process article: list index out of range\n", + "100%|██████████| 98/98 [02:28<00:00, 1.52s/it]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "rank_articles" + ], + "metadata": { + "id": "y1NRbe_8yeFx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from collections import Counter\n", + "\n", + "# Flatten the per-article rows into one list of entries.\n", + "flattened_list = [entry for row in rank_articles for entry in row]\n", + "# Remove any '[' / ']' characters from each entry before counting.\n", + "list_without_brackets = [\n", + "    entry.replace(\"[\", \"\").replace(\"]\", \"\")\n", + "    for entry in flattened_list\n", + "]\n", + "# Occurrences of each cleaned entry across all rows.\n", + "counter = Counter(list_without_brackets)" + ], + "metadata": { + "id": "VbbafqvPqNn3" + }, + "execution_count": 45, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "\n", + "# One row per distinct entry, most frequent first.\n", + "df = (\n", + "    pd.DataFrame.from_dict(counter, orient='index', columns=['Count'])\n", + "    .sort_values(by='Count', ascending=False)\n", + ")\n", + "# Drop rows whose label contains any of these substrings (case-sensitive).\n", + "for unwanted in (\"united states\", \"United States\", \"None\"):\n", + "    df = df[~df.index.str.contains(unwanted)]\n", + "\n", + "df[:25]" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 833 + }, + "id": "94SDiIg2qYhe", + "outputId": "c5fb9023-cbf7-4da4-912b-50703516ea77" + }, + "execution_count": 52, + "outputs": [ + { + "output_type": 
"execute_result", + "data": { + "text/plain": [ + " Count\n", + "firefighters 13\n", + "afternoon 6\n", + "disease 5\n", + "temperatures 5\n", + "rebels 4\n", + "emergency services 4\n", + "morning 4\n", + "hearst television 3\n", + "ceasefire 3\n", + "heavy snow 3\n", + "evacuation 3\n", + "cease 3\n", + "freezing rain 3\n", + "floor 2\n", + "watsonville 2\n", + "refugees 2\n", + "machines 2\n", + "heavy snowfall 2\n", + "parents 2\n", + "winter weather 2\n", + "widespread snow 2\n", + "beach 2\n", + "'Gaza Strip', '16-01-2024', 2\n", + "quake 2\n", + "rescue 2" + ], + "text/html": [ + "\n", + "
\n", + " | Count | \n", + "
---|---|
firefighters | \n", + "13 | \n", + "
afternoon | \n", + "6 | \n", + "
disease | \n", + "5 | \n", + "
temperatures | \n", + "5 | \n", + "
rebels | \n", + "4 | \n", + "
emergency services | \n", + "4 | \n", + "
morning | \n", + "4 | \n", + "
hearst television | \n", + "3 | \n", + "
ceasefire | \n", + "3 | \n", + "
heavy snow | \n", + "3 | \n", + "
evacuation | \n", + "3 | \n", + "
cease | \n", + "3 | \n", + "
freezing rain | \n", + "3 | \n", + "
floor | \n", + "2 | \n", + "
watsonville | \n", + "2 | \n", + "
refugees | \n", + "2 | \n", + "
machines | \n", + "2 | \n", + "
heavy snowfall | \n", + "2 | \n", + "
parents | \n", + "2 | \n", + "
winter weather | \n", + "2 | \n", + "
widespread snow | \n", + "2 | \n", + "
beach | \n", + "2 | \n", + "
'Gaza Strip', '16-01-2024', | \n", + "2 | \n", + "
quake | \n", + "2 | \n", + "
rescue | \n", + "2 | \n", + "
tag. You might need to adjust this depending on the webpage structure.\n", - " story = soup.find_all('p')\n", - "\n", - " # Extract the text from the story\n", - " full_story = ' '.join(p.text for p in story)\n", - " full_stories.append(full_story)\n", - " return full_stories\n" - ], - "metadata": { - "id": "RJjNm4qvEZoD" - }, - "execution_count": 3, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "def chunk_text(text, max_len):\n", - " # Tokenize the text into tokens\n", - " tokens = nltk.word_tokenize(text)\n", - "\n", - " # Calculate the number of chunks and the size of the final chunk\n", - " num_chunks = len(tokens) // max_len\n", - " final_chunk_size = len(tokens) % max_len\n", - "\n", - " # If the final chunk is too small, distribute its tokens among the other chunks\n", - " if final_chunk_size < max_len / 2:\n", - " num_chunks += 1\n", - " chunk_sizes = [len(tokens) // num_chunks + (1 if i < len(tokens) % num_chunks else 0) for i in range(num_chunks)]\n", - " chunks = [tokens[sum(chunk_sizes[:i]):sum(chunk_sizes[:i+1])] for i in range(num_chunks)]\n", - " else:\n", - " chunks = [tokens[i:i + max_len] for i in range(0, len(tokens), max_len)]\n", - "\n", - " return chunks\n" - ], - "metadata": { - "id": "EnFhQIkrEZs5" - }, - "execution_count": 4, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "from torch.utils.data import DataLoader\n", - "\n", - "def featurize_stories(text, max_len, top_k):\n", - " # Extract candidate words/phrases\n", - " count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])\n", - " all_candidates = count.get_feature_names_out()\n", - " doc = nlp(text)\n", - " noun_phrases = set(chunk.text.strip().lower() for chunk in doc.noun_chunks)\n", - " nouns = set()\n", - " for token in doc:\n", - " if token.pos_ == \"NOUN\":\n", - " nouns.add(token.text)\n", - "\n", - " all_nouns = nouns.union(noun_phrases)\n", - " candidates = list(filter(lambda candidate: candidate in all_nouns, 
all_candidates))\n", - " candidate_tokens = tokenizer(candidates, padding=True, return_tensors=\"pt\")\n", - " candidate_tokens = {k: v.to(device) for k, v in (candidate_tokens).items()}\n", - " candidate_embeddings = model(**candidate_tokens)[\"pooler_output\"]\n", - " candidate_embeddings = candidate_embeddings.detach()#.to_numpy()\n", - "\n", - " # words = nltk.word_tokenize(text)\n", - " # chunks = [words[i:i + 512] for i in range(0, len(words), 512)]\n", - " chunks = chunk_text(text, max_len) # use this to chunk better and use less padding thus less memory but also less affect from averging\n", - "\n", - " for chunk in chunks:\n", - " text_tokens = tokenizer(chunk, padding=True, return_tensors=\"pt\")\n", - " text_tokens = {k: v.to(device) for k, v in (text_tokens).items()}\n", - " text_embedding = model(**text_tokens)[\"pooler_output\"]\n", - " text_embedding = text_embedding.detach()#.to_numpy()\n", - " embeddings.append(text_embedding)\n", - " max_emb_shape = max(embedding.shape[0] for embedding in embeddings)\n", - " padded_embeddings = [np.pad(embedding.cpu(), ((0, max_emb_shape - embedding.shape[0]), (0, 0))) for embedding in embeddings]\n", - " avg_embedding = np.min(padded_embeddings, axis=0)\n", - " distances = cosine_similarity(avg_embedding, candidate_embeddings.cpu())\n", - " torch.cuda.empty_cache()\n", - " return [candidates[index] for index in distances.argsort()[0][::-1][-top_k:]]\n", - "\n" - ], - "metadata": { - "id": "RR_2MZ5nEZvO" - }, - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "# data=pd.read_csv('/content/drive/MyDrive/consult/Louie_disaster_tweets.csv',header=None)\n", - "data=pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet.csv')" - ], - "metadata": { - "id": "3mlI9cOaEsnS" - }, - "execution_count": 14, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "data['text']" - ], - "metadata": { - "colab": { - "base_uri": 
"https://localhost:8080/" + "ec83790b87be401286f894faf430a8c4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - "id": "VaqzAonQGa11", - "outputId": "eb0ccd0e-95d9-4ecb-ff5a-bd4fc20cae1d" - }, - "execution_count": 15, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0 #DoNothingDeSantis\\nWith Hurricane Season Loom...\n", - "1 @drscribblesmd I'll be so glad when hurricane ...\n", - "2 @wideawake_media @useful_eater The Florida hur...\n", - "3 @135knots The hurricane landfall better be awa...\n", - "4 @WSJ It’s Hurricane season. \n", - " ... \n", - "8881 @Shiri8580 @Chandrakbose Ur story of Savarkar ...\n", - "8882 Accuracy of labeling refers to the ingredients...\n", - "8883 @citizentvkenya Is she even still a Bishop? 
An...\n", - "8884 🔥 New RetroDrop : DistricOne x OpenLaverage \\n...\n", - "8885 @alikous I think what makes him more willing i...\n", - "Name: text, Length: 8886, dtype: object" - ] - }, - "metadata": {}, - "execution_count": 15 - } - ] - }, - { - "cell_type": "code", - "source": [ - "df = pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features.txt',sep='\\t')" - ], - "metadata": { - "id": "e5VsbTP7WaSH" - }, - "execution_count": 16, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "rank_articles=[]\n", - "from tqdm import tqdm\n", - "for i in tqdm(range(len(df),len(data['text']))):\n", - " try:\n", - " cc=featurize_stories(data['text'][i], max_len=512, top_k=4)\n", - " # print(cc)\n", - " rank_articles.append(cc)\n", - " except IndexError:\n", - " pass" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "ed26469a161c42df9f6c1070b84be68b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c2123ea20a684903b82dc376a409c682", + "placeholder": "", + "style": "IPY_MODEL_c58ce8753ead4874b0d81bab8f70a7bd", + "value": " 899k/899k [00:00<00:00, 1.13MB/s]" + } }, - "id": "ta3pWa5BEZx2", - "outputId": "57d328e4-81f7-4aa0-c629-afe56889c4ad" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - " 83%|████████▎ | 6098/7321 [2:06:39<48:04, 2.36s/it]" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "flattened_list = [item for sublist in rank_articles for item in sublist]\n", - "from collections import Counter\n", - "counter = 
Counter(flattened_list)\n", - "df = pd.DataFrame.from_dict(counter, orient='index', columns=['Count'])\n", - "\n", - "df = df.sort_values(by='Count',ascending=False)\n", - "df.to_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features2.txt',sep='\\t')\n", - "print(len(df))\n", - "# df[:25]" - ], - "metadata": { - "id": "Q3pN0rHtfWon" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "df = pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features.txt',sep='\\t')\n", - "df2=pd.read_csv('/content/drive/MyDrive/consult/florida-hurricane-tweet_features2.txt',sep='\\t')\n", - "\n", - "df=pd.concat([df,df2])\n", - "print(df['Unnamed: 0'])\n", - "df = df.groupby('Unnamed: 0').sum().sort_values(by='Count',ascending=False)\n", - "df=df[df['Count']>int(np.round(len(df)*.001))]" - ], - "metadata": { - "id": "rmd7fnVDebrq", - "colab": { - "base_uri": "https://localhost:8080/" + "edb2ec97694341d19092a23c8be4e0bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7073179f100644718ee8c660da9a1b2d", + "IPY_MODEL_b59153f234b84236bf9fad4100c30b4c", + "IPY_MODEL_7a761207d5e34d55b955587d247458bf" + ], + "layout": "IPY_MODEL_b0446147f04946e490e51e1f353fcd51" + } }, - "outputId": "7bc9282e-cdf8-4fa3-f8de-b3c52b10f825" - }, - "execution_count": 45, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0 storm\n", - "1 tornado\n", - "2 weather\n", - "3 snow\n", - "4 day\n", - " ... 
\n", - "6020 witnesses\n", - "6021 fleece\n", - "6022 fleas\n", - "6023 ensemble members\n", - "6024 walkway\n", - "Name: Unnamed: 0, Length: 7590, dtype: object\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "import spacy\n", - "\n", - "nlp = spacy.load('en_core_web_sm')\n", - "\n", - "# nouns = ['apple', 'John', 'London', 'dog', 'Mary', 'Paris', 'banana']\n", - "nouns= df.reset_index()['Unnamed: 0'].to_list()\n", - "doc = nlp(' '.join(nouns))\n", - "\n", - "proper_nouns = [token.text for token in doc if token.pos_ == 'PROPN']\n", - "\n", - "print(proper_nouns)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "f029693037ed49049bcd87b3d9683b2d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "id": "yLeHkpHP_2aj", - "outputId": "9a041c28-2188-40aa-ff4c-81d78e83e78c" - }, - "execution_count": 46, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "['tornado', 'lol', 'tokens', 'march', 'partnership', 'horizon', 'coast', 'times', 'texas', 'california', 'desantis', 'feet', 'saturday', 'israel', 'tornado', 'february', 'god', 'friday', 'crime', 'america', 'neil', 'oliver', 'couple', 'st', 'gaza', 'outlook', 'anticipation', 'umbrella', 'flwx', 'house', 'drought', 'canada', 'sun', 'sunday', 'hell', 'games', 'fans', 'illinois', 'space', 'party', 'mother', 'chicago', 'new', 'england', 'baby', 'mother', 'ٺون', 'tuesday', 'biden', 'russia', 'terrorists', 'county', 'thursday', 'rainbow', 'disaster', 'luck', 'nevada', 'women', 'ocean', 'kids', 'sierra', 'nevada', 'warmth', 'mar', 'safety', 'youtube', 'wednesday', 'hamas', 'mexico', 
'miami', 'deluge', 'anxiety', 'supercell', 'counties', 'southern', 'california', 'air', 'hope', 'yield', 'monday', 'txwx', 'ca', 'seas', 'ilwx', 'movie', 'round', 'kansas', 'fan']\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "print(len(proper_nouns))\n", - "proper_nouns" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "f17c69db88f646ca9c2bf1cc354fc907": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_67488582ce1940cfbf07c577758f0b1d", + "placeholder": "", + "style": "IPY_MODEL_605d95933565413886ff6508b7d20453", + "value": "vocab.json: 100%" + } }, - "id": "YGiVzRMvFX3X", - "outputId": "adc2b1a6-c0cc-40c6-bee5-003b6839c87d" - }, - "execution_count": 47, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "88\n" - ] + "f3b91c2405d4476bb05d23ffec46c3c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + 
"grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['tornado',\n", - " 'lol',\n", - " 'tokens',\n", - " 'march',\n", - " 'partnership',\n", - " 'horizon',\n", - " 'coast',\n", - " 'times',\n", - " 'texas',\n", - " 'california',\n", - " 'desantis',\n", - " 'feet',\n", - " 'saturday',\n", - " 'israel',\n", - " 'tornado',\n", - " 'february',\n", - " 'god',\n", - " 'friday',\n", - " 'crime',\n", - " 'america',\n", - " 'neil',\n", - " 'oliver',\n", - " 'couple',\n", - " 'st',\n", - " 'gaza',\n", - " 'outlook',\n", - " 'anticipation',\n", - " 'umbrella',\n", - " 'flwx',\n", - " 'house',\n", - " 'drought',\n", - " 'canada',\n", - " 'sun',\n", - " 'sunday',\n", - " 'hell',\n", - " 'games',\n", - " 'fans',\n", - " 'illinois',\n", - " 'space',\n", - " 'party',\n", - " 'mother',\n", - " 'chicago',\n", - " 'new',\n", - " 'england',\n", - " 'baby',\n", - " 'mother',\n", - " 'ٺون',\n", - " 'tuesday',\n", - " 'biden',\n", - " 'russia',\n", - " 'terrorists',\n", - " 'county',\n", - " 'thursday',\n", - " 'rainbow',\n", - " 'disaster',\n", - " 'luck',\n", - " 'nevada',\n", - " 'women',\n", - " 'ocean',\n", - " 'kids',\n", - " 'sierra',\n", - " 'nevada',\n", - " 'warmth',\n", - " 'mar',\n", - " 'safety',\n", - " 'youtube',\n", - " 'wednesday',\n", - " 'hamas',\n", - " 'mexico',\n", - " 'miami',\n", - " 'deluge',\n", - " 'anxiety',\n", - " 'supercell',\n", - " 'counties',\n", - " 'southern',\n", - " 'california',\n", - " 'air',\n", - " 'hope',\n", - " 'yield',\n", - " 'monday',\n", - " 
'txwx',\n", - " 'ca',\n", - " 'seas',\n", - " 'ilwx',\n", - " 'movie',\n", - " 'round',\n", - " 'kansas',\n", - " 'fan']" - ] - }, - "metadata": {}, - "execution_count": 47 - } - ] - }, - { - "cell_type": "code", - "source": [ - "len(nouns)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "f9fa031e785f4335a3fc4a36d611e207": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - "id": "IDJUwNpyFYSl", - "outputId": "4e50b25b-5b07-4bc6-b431-afa792c26c0c" - }, - "execution_count": 48, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "438" - ] - }, - "metadata": {}, - "execution_count": 48 + "fe0fdd93415c4970b6428e0b66c30681": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_69318b3a6a6749dbbbbdb87b506e94cb", + "IPY_MODEL_ddacb928a856483481546b838556ac42", + "IPY_MODEL_4377321d9ca04dd4ab10fe6ff82cf160" + ], + "layout": "IPY_MODEL_cef9e8b68be64849bcc259f7b5dc5e1f" + } } - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "Nxgc3toYFllR" - }, - "execution_count": null, - "outputs": [] + } } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file