From 99e84b2c51bb0751bb6b4507a9a895f6de8060ef Mon Sep 17 00:00:00 2001 From: Daniel Date: Fri, 22 Mar 2024 18:11:47 +0800 Subject: [PATCH] bg attempt --- DOTS/dots_feat.ipynb | 84 +++++++++++++++++++++++++++++++++++++ DOTS/scrape.py | 32 ++++++++++++++ DOTS/test/test_dots_feat.py | 6 +-- 3 files changed, 119 insertions(+), 3 deletions(-) diff --git a/DOTS/dots_feat.ipynb b/DOTS/dots_feat.ipynb index ca854ef..1048e88 100644 --- a/DOTS/dots_feat.ipynb +++ b/DOTS/dots_feat.ipynb @@ -1246,6 +1246,90 @@ " rank_articles.append([foreparts,cc])" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## big google news" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.getcwd()\n", + "os.chdir('..')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from scrape import get_related_news\n", + "from datetime import datetime, timedelta" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%| | 0/150 [00:00 1\u001b[0m data\u001b[38;5;241m=\u001b[39m\u001b[43mget_related_news\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mfire\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;28;43mmax\u001b[39;49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m150\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/WRK/dcolinmorgan/dots/DOTS/scrape.py:120\u001b[0m, in \u001b[0;36mget_related_news\u001b[0;34m(keyword, max)\u001b[0m\n\u001b[1;32m 118\u001b[0m google_news\u001b[38;5;241m.\u001b[39mstart_date \u001b[38;5;241m=\u001b[39m (start_date\u001b[38;5;241m.\u001b[39myear, start_date\u001b[38;5;241m.\u001b[39mmonth, start_date\u001b[38;5;241m.\u001b[39mday, start_date\u001b[38;5;241m.\u001b[39mhour, start_date\u001b[38;5;241m.\u001b[39mminute, start_date\u001b[38;5;241m.\u001b[39msecond)\n\u001b[1;32m 119\u001b[0m google_news\u001b[38;5;241m.\u001b[39mend_date \u001b[38;5;241m=\u001b[39m (end_date\u001b[38;5;241m.\u001b[39myear, end_date\u001b[38;5;241m.\u001b[39mmonth, end_date\u001b[38;5;241m.\u001b[39mday, end_date\u001b[38;5;241m.\u001b[39mhour, end_date\u001b[38;5;241m.\u001b[39mminute, end_date\u001b[38;5;241m.\u001b[39msecond)\n\u001b[0;32m--> 120\u001b[0m r\u001b[38;5;241m=\u001b[39m\u001b[43mgoogle_news\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_news\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyword\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 121\u001b[0m results\u001b[38;5;241m.\u001b[39mappend(r)\n\u001b[1;32m 123\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdots/input/BG_results.json\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124ma\u001b[39m\u001b[38;5;124m'\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/gnews/gnews.py:246\u001b[0m, in \u001b[0;36mGNews.get_news\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 244\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m%\u001b[39m\u001b[38;5;124m20\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(key\u001b[38;5;241m.\u001b[39msplit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m))\n\u001b[1;32m 245\u001b[0m query \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124m/search?q=\u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;241m.\u001b[39mformat(key)\n\u001b[0;32m--> 246\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_news\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/gnews/gnews.py:308\u001b[0m, in \u001b[0;36mGNews._get_news\u001b[0;34m(self, query)\u001b[0m\n\u001b[1;32m 306\u001b[0m feed_data \u001b[38;5;241m=\u001b[39m feedparser\u001b[38;5;241m.\u001b[39mparse(url, agent\u001b[38;5;241m=\u001b[39mUSER_AGENT, handlers\u001b[38;5;241m=\u001b[39m[proxy_handler])\n\u001b[1;32m 307\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 308\u001b[0m feed_data \u001b[38;5;241m=\u001b[39m \u001b[43mfeedparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mparse\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magent\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mUSER_AGENT\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 310\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m [item \u001b[38;5;28;01mfor\u001b[39;00m item \u001b[38;5;129;01min\u001b[39;00m\n\u001b[1;32m 311\u001b[0m \u001b[38;5;28mmap\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_process, feed_data\u001b[38;5;241m.\u001b[39mentries[:\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_max_results]) \u001b[38;5;28;01mif\u001b[39;00m item]\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err:\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/feedparser/api.py:216\u001b[0m, in \u001b[0;36mparse\u001b[0;34m(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, response_headers, resolve_relative_uris, sanitize_html)\u001b[0m\n\u001b[1;32m 208\u001b[0m result \u001b[38;5;241m=\u001b[39m FeedParserDict(\n\u001b[1;32m 209\u001b[0m bozo\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 210\u001b[0m entries\u001b[38;5;241m=\u001b[39m[],\n\u001b[1;32m 211\u001b[0m feed\u001b[38;5;241m=\u001b[39mFeedParserDict(),\n\u001b[1;32m 212\u001b[0m headers\u001b[38;5;241m=\u001b[39m{},\n\u001b[1;32m 213\u001b[0m )\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 216\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[43m_open_resource\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl_file_stream_or_string\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43metag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodified\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreferrer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhandlers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrequest_headers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 217\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m urllib\u001b[38;5;241m.\u001b[39merror\u001b[38;5;241m.\u001b[39mURLError \u001b[38;5;28;01mas\u001b[39;00m error:\n\u001b[1;32m 218\u001b[0m result\u001b[38;5;241m.\u001b[39mupdate({\n\u001b[1;32m 219\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbozo\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;28;01mTrue\u001b[39;00m,\n\u001b[1;32m 220\u001b[0m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mbozo_exception\u001b[39m\u001b[38;5;124m'\u001b[39m: error,\n\u001b[1;32m 221\u001b[0m })\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/feedparser/api.py:115\u001b[0m, in \u001b[0;36m_open_resource\u001b[0;34m(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, result)\u001b[0m\n\u001b[1;32m 111\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m url_file_stream_or_string\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(url_file_stream_or_string, \u001b[38;5;28mstr\u001b[39m) \\\n\u001b[1;32m 114\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m urllib\u001b[38;5;241m.\u001b[39mparse\u001b[38;5;241m.\u001b[39murlparse(url_file_stream_or_string)[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhttp\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhttps\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mftp\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfile\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mfeed\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[0;32m--> 115\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhttp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43murl_file_stream_or_string\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43metag\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodified\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43magent\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreferrer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhandlers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrequest_headers\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mresult\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 117\u001b[0m \u001b[38;5;66;03m# try to open with native open function (if url_file_stream_or_string is a filename)\u001b[39;00m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/site-packages/feedparser/http.py:171\u001b[0m, in \u001b[0;36mget\u001b[0;34m(url, etag, modified, agent, referrer, handlers, request_headers, result)\u001b[0m\n\u001b[1;32m 169\u001b[0m opener \u001b[38;5;241m=\u001b[39m urllib\u001b[38;5;241m.\u001b[39mrequest\u001b[38;5;241m.\u001b[39mbuild_opener(\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mtuple\u001b[39m(handlers \u001b[38;5;241m+\u001b[39m [_FeedURLHandler()]))\n\u001b[1;32m 170\u001b[0m opener\u001b[38;5;241m.\u001b[39maddheaders \u001b[38;5;241m=\u001b[39m [] \u001b[38;5;66;03m# RMK - must clear so we only send our custom User-Agent\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[43mopener\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 172\u001b[0m data \u001b[38;5;241m=\u001b[39m f\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m 173\u001b[0m f\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/urllib/request.py:515\u001b[0m, in \u001b[0;36mOpenerDirector.open\u001b[0;34m(self, fullurl, data, timeout)\u001b[0m\n\u001b[1;32m 512\u001b[0m req \u001b[38;5;241m=\u001b[39m meth(req)\n\u001b[1;32m 514\u001b[0m sys\u001b[38;5;241m.\u001b[39maudit(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124murllib.Request\u001b[39m\u001b[38;5;124m'\u001b[39m, req\u001b[38;5;241m.\u001b[39mfull_url, req\u001b[38;5;241m.\u001b[39mdata, req\u001b[38;5;241m.\u001b[39mheaders, req\u001b[38;5;241m.\u001b[39mget_method())\n\u001b[0;32m--> 515\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;66;03m# post-process response\u001b[39;00m\n\u001b[1;32m 518\u001b[0m meth_name \u001b[38;5;241m=\u001b[39m protocol\u001b[38;5;241m+\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_response\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/urllib/request.py:532\u001b[0m, in \u001b[0;36mOpenerDirector._open\u001b[0;34m(self, req, data)\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[1;32m 531\u001b[0m protocol \u001b[38;5;241m=\u001b[39m req\u001b[38;5;241m.\u001b[39mtype\n\u001b[0;32m--> 532\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_chain\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhandle_open\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprotocol\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mprotocol\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m_open\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result:\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/urllib/request.py:492\u001b[0m, in \u001b[0;36mOpenerDirector._call_chain\u001b[0;34m(self, chain, kind, meth_name, *args)\u001b[0m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m handler \u001b[38;5;129;01min\u001b[39;00m handlers:\n\u001b[1;32m 491\u001b[0m func \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(handler, meth_name)\n\u001b[0;32m--> 492\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 494\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/urllib/request.py:1392\u001b[0m, in \u001b[0;36mHTTPSHandler.https_open\u001b[0;34m(self, req)\u001b[0m\n\u001b[1;32m 1391\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mhttps_open\u001b[39m(\u001b[38;5;28mself\u001b[39m, req):\n\u001b[0;32m-> 1392\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdo_open\u001b[49m\u001b[43m(\u001b[49m\u001b[43mhttp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mHTTPSConnection\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1393\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_context\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/urllib/request.py:1348\u001b[0m, in \u001b[0;36mAbstractHTTPHandler.do_open\u001b[0;34m(self, http_class, req, **http_conn_args)\u001b[0m\n\u001b[1;32m 1346\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m err: \u001b[38;5;66;03m# timeout error\u001b[39;00m\n\u001b[1;32m 1347\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m URLError(err)\n\u001b[0;32m-> 1348\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43mh\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1349\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[1;32m 1350\u001b[0m h\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/http/client.py:1423\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1421\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1422\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1423\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1424\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[1;32m 1425\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/http/client.py:331\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 329\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 330\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 331\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 332\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[1;32m 333\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/http/client.py:292\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 292\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 293\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/socket.py:707\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 706\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 707\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 708\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 709\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/ssl.py:1252\u001b[0m, in \u001b[0;36mSSLSocket.recv_into\u001b[0;34m(self, buffer, nbytes, flags)\u001b[0m\n\u001b[1;32m 1248\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m flags \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1250\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnon-zero flags not allowed in calls to recv_into() on \u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m%\u001b[39m\n\u001b[1;32m 1251\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m)\n\u001b[0;32m-> 1252\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnbytes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1253\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1254\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m()\u001b[38;5;241m.\u001b[39mrecv_into(buffer, nbytes, flags)\n", + "File \u001b[0;32m/opt/homebrew/Caskroom/mambaforge/base/envs/DT/lib/python3.12/ssl.py:1104\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1102\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1103\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m buffer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sslobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffer\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1106\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_sslobj\u001b[38;5;241m.\u001b[39mread(\u001b[38;5;28mlen\u001b[39m)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "data=get_related_news('fire',max=150)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/DOTS/scrape.py b/DOTS/scrape.py index a253edf..fccf0fc 100644 --- a/DOTS/scrape.py +++ b/DOTS/scrape.py @@ -2,6 +2,7 @@ from opensearchpy import OpenSearch from dotenv import load_dotenv from tqdm import tqdm +from datetime import datetime, timedelta from gnews import GNews import xml.etree.ElementTree as ET load_dotenv() @@ -104,3 +105,34 @@ def get_npr_news(p): full_story = ' '.join(p.text for p in story) full_stories.append(full_story) return full_stories + + +def get_related_news(keyword:str, max)->list[dict]: + google_news = GNews() + end_date = datetime.now() + start_date = end_date - timedelta(seconds=30) + + results=[] + pbar = tqdm(total=max) + while len(results) < max: + google_news.start_date = (start_date.year, start_date.month, start_date.day, start_date.hour, start_date.minute, start_date.second) + google_news.end_date = (end_date.year, end_date.month, end_date.day, end_date.hour, end_date.minute, end_date.second) + r=google_news.get_news(keyword) + results.append(r) + + with open('dots/input/BG_results.json', 'a') as f: + f.write(json.dumps(r)) + f.write('\n') # Add a newline character for readability + + # Add 30 seconds to start_date and end_date + start_date += timedelta(seconds=30) + end_date += timedelta(seconds=30) + + pbar.update(len(results) - pbar.n) + + pbar.close() + sorted_results= sorted(results, + key=lambda x: datetime.strptime(x['published date'], "%a, %d %b %Y %H:%M:%S %Z"), + reverse=True) + + return sorted_results diff --git a/DOTS/test/test_dots_feat.py b/DOTS/test/test_dots_feat.py index de86dea..3ab1659 100644 --- a/DOTS/test/test_dots_feat.py +++ b/DOTS/test/test_dots_feat.py @@ -1,7 +1,7 @@ import pytest, requests -from dots.feat import chunk_text, featurize_stories -from dots.scrape import get_OS_data, get_google_news, get_massive_OS_data, get_npr_news -from dots.pull import process_hit, process_data, pull_data +from DOTS.feat import chunk_text, featurize_stories +from DOTS.scrape import get_OS_data, get_google_news, get_massive_OS_data, get_npr_news +from DOTS.pull import process_hit, process_data, pull_data from datetime import datetime