From 0c425eebcff9e966340a6d84a16189e69e4b82d3 Mon Sep 17 00:00:00 2001 From: McKenzie Kulseth <73662960+mkulseth@users.noreply.github.com> Date: Mon, 1 Feb 2021 11:23:29 -0600 Subject: [PATCH] Altered the Code Altered the code to be more efficient and reliable and repeatable. --- Web-Scrapping-Copy1.ipynb | 322 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 322 insertions(+) create mode 100644 Web-Scrapping-Copy1.ipynb diff --git a/Web-Scrapping-Copy1.ipynb b/Web-Scrapping-Copy1.ipynb new file mode 100644 index 0000000..dc1fe3d --- /dev/null +++ b/Web-Scrapping-Copy1.ipynb @@ -0,0 +1,322 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

Web Scrapping Python Script

\n", + "

By: McKenzie Kulseth



\n", + "

Purpose of this code:

\n", + "

This code will parse a webpage for data and extract that data. \n", + "

NOTE: One should orgainze the csv files being save into a directory tree to better organize the files.

\n", + "


1.) Import Libraries

\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import requests as req; import lxml; from bs4 import BeautifulSoup; import csv;\n", + "import numpy as np; import pandas as pd; from datetime import *; import threading as th; import os" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "


2.) Get the station ids for the sounding stations from a website\n", + "

\n", + "

" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 3460 in the United States.\n" + ] + } + ], + "source": [ + "station_ids = []\n", + "with open(\"stations_2.csv\") as myfile:\n", + " myreader = csv.reader(myfile,delimiter=\",\")\n", + " row_count = 0\n", + " for row in myreader:\n", + " if row_count != 0 and len(row) != 0:\n", + " station_ids.append(row[0])\n", + " row_count += 1\n", + "print(\"There are \"+str(len(station_ids))+\" in the United States.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "


3.) Creating lists to hold the dates to be looped through to find sounding data.\n", + "

" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done creating years, months, and days to loop through for getting data.\n" + ] + } + ], + "source": [ + "today = date.today()\n", + "this_year = int(today.strftime(\"%Y\"))\n", + "years = np.arange(1990,this_year)\n", + "months = ['01','02','03','04','05','06','07','08','09','10','11','12']\n", + "print(\"Done creating years, months, and days to loop through for getting data.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a function that checks for data in website\n", + "class SoundingData:\n", + " def __init__(self, station_id=\"MPX\", yyyy=2020, mm=12):\n", + " self.station_id = station_id\n", + " self.year = str(yyyy)\n", + " self.month = str(mm)\n", + " self.day1 = \"01\"\n", + " self.day2 = \"31\"\n", + " if self.month == \"02\" and int(self.year)%4 == 0:\n", + " self.day2 = \"29\"\n", + " elif self.month == \"02\" and int(self.year)%4 != 0:\n", + " self.day2 = \"28\"\n", + " elif self.month not in [\"01\",\"03\",\"05\",\"07\",\"08\",\"10\",\"12\"]:\n", + " self.day2 = \"30\"\n", + " \n", + " self.websites = [\"http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=\"+self.year+\"&MONTH=\"+self.month+\"&FROM=\"+self.day1+\"00&TO=\"+self.day2+\"12&STNM=\"+self.station_id]\n", + " \n", + " def GetSoundingData(self):\n", + " data_websites = []\n", + " # Read the website(s) using bs4\n", + " for website in self.websites:\n", + " data_websites.append(website)\n", + "\n", + " return data_websites" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=02&FROM=0100&TO=2912&STNM=MPX']\n" + ] + } + ], + "source": [ + "# Test the function DataCheck\n", + "testing_Data1 = SoundingData(yyyy=2020,mm=\"02\")\n", + "testing_DataCheck = testing_Data1.GetSoundingData()\n", + "print(testing_DataCheck)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "


4.) Loop through those dates and station ids to create a list of website to be parsed for sounding data

" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Appending websites to a list to be looped through when finding sounding data.\n", + "\n", + "Done.\n", + "There are 1287120 to sift through.\n" + ] + } + ], + "source": [ + "print(\"Appending websites to a list to be looped through when finding sounding data.\")\n", + "web_sites= []; count = 0\n", + "for y in np.flipud(years):\n", + " for m in np.flipud(months):\n", + " for ids in station_ids:\n", + " Sounding_Data = SoundingData(ids,y,m)\n", + " websites = Sounding_Data.GetSoundingData()\n", + " for website in websites:\n", + " web_sites.append(website)\n", + "print(\"\\nDone.\\nThere are \"+str(len(web_sites))+\" to sift through.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "


5.) Get sounding data from websites\n", + "

  • Loop through websites\n", + "
  • Store sounding data in dictionary\n", + "
  • Create a panadas dataframe to store the dictionary\n", + "
  • Save the dataframe as a csv file\n", + "
\n", + "

" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtest_site\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mweb_sites\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;31m# test_site = \"http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=12&FROM=0100&TO=3112&STNM=BMX\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0msource\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mreq\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mtest_site\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 4\u001b[0m \u001b[0msoup\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mBeautifulSoup\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msource\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m\"lxml\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0msoup\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"pre\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\api.py\u001b[0m in \u001b[0;36mget\u001b[1;34m(url, params, **kwargs)\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 75\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'allow_redirects'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 76\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'get'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 77\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 78\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\api.py\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(method, url, **kwargs)\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[1;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 60\u001b[0m \u001b[1;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 61\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 62\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 63\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[0;32m 540\u001b[0m }\n\u001b[0;32m 541\u001b[0m \u001b[0msend_kwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 542\u001b[1;33m \u001b[0mresp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 543\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 544\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\sessions.py\u001b[0m in \u001b[0;36msend\u001b[1;34m(self, request, **kwargs)\u001b[0m\n\u001b[0;32m 653\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 654\u001b[0m \u001b[1;31m# Send the request\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 655\u001b[1;33m \u001b[0mr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 656\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 657\u001b[0m \u001b[1;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\requests\\adapters.py\u001b[0m in \u001b[0;36msend\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m 437\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 438\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunked\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 439\u001b[1;33m resp = conn.urlopen(\n\u001b[0m\u001b[0;32m 440\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 441\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m 697\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 698\u001b[0m \u001b[1;31m# Make the request on the httplib connection object.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 699\u001b[1;33m httplib_response = self._make_request(\n\u001b[0m\u001b[0;32m 700\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 701\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 443\u001b[0m \u001b[1;31m# Python 3 (including for exceptions like SystemExit).\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 444\u001b[0m \u001b[1;31m# Otherwise it looks like a bug in the code.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 445\u001b[1;33m \u001b[0msix\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mraise_from\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 446\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mSocketError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 447\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_raise_timeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0merr\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtimeout_value\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mread_timeout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\packages\\six.py\u001b[0m in \u001b[0;36mraise_from\u001b[1;34m(value, from_value)\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m 438\u001b[0m \u001b[1;31m# Python 3\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 439\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 440\u001b[1;33m \u001b[0mhttplib_response\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetresponse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 441\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 442\u001b[0m \u001b[1;31m# Remove the TypeError from the exception chain in\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36mgetresponse\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1345\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1346\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1347\u001b[1;33m \u001b[0mresponse\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mbegin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1348\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1349\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36mbegin\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 305\u001b[0m \u001b[1;31m# read until we get a non-100 response\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 306\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 307\u001b[1;33m \u001b[0mversion\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatus\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mreason\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 308\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mstatus\u001b[0m \u001b[1;33m!=\u001b[0m \u001b[0mCONTINUE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 309\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\http\\client.py\u001b[0m in \u001b[0;36m_read_status\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 266\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 267\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_read_status\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 268\u001b[1;33m \u001b[0mline\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreadline\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_MAXLINE\u001b[0m \u001b[1;33m+\u001b[0m \u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"iso-8859-1\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 269\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mline\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m>\u001b[0m \u001b[0m_MAXLINE\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 270\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0mLineTooLong\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"status line\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\socket.py\u001b[0m in \u001b[0;36mreadinto\u001b[1;34m(self, b)\u001b[0m\n\u001b[0;32m 667\u001b[0m \u001b[1;32mwhile\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 668\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 669\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrecv_into\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 670\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 671\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_timeout_occurred\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for test_site in web_sites:\n", + "# test_site = \"http://weather.uwyo.edu/cgi-bin/sounding?region=naconf&TYPE=TEXT%3ALIST&YEAR=2020&MONTH=12&FROM=0100&TO=3112&STNM=BMX\"\n", + " source = req.get(test_site).text\n", + " soup = BeautifulSoup(source,\"lxml\")\n", + " data = soup.find_all(\"pre\")\n", + " year = ''; month = ''; station_id = '';\n", + " if test_site[-3] != \"=\":\n", + " year = test_site[-40:-36]\n", + " month = test_site[-29:-27]\n", + " station_id = test_site[-3:]\n", + " else:\n", + " year = test_site[-39:-35]\n", + " month = test_site[-28:-26]\n", + " station_id = test_site[-2:]\n", + " if len(data) > 0:\n", + " for d in range(1,len(data),2):\n", + " d1 = data[d].text.split(\"\\n\")\n", + " day = d1[3].split(\":\")[1].split(\"/\")[0][-2:]; hour = d1[3].split(\":\")[1].split(\"/\")[1];\n", + "\n", + " file_1 = station_id+\"_\"+year+\"_\"+month+\"_\"+day+\"_\"+hour+\".csv\"\n", + "\n", + " head = [d2 for d2 in data[d-1].text.split(\"\\n\")[2].split(\" \") if d2 != '' ]\n", + " head_1 = []\n", + " units = [d2 for d2 in data[d-1].text.split(\"\\n\")[3].split(\" \") if d2 != '' ]\n", + " data1 = [[np.nan for z in head] for d2 in data[d-1].text.split(\"\\n\")[5:]]\n", + " data2 = []\n", + " count = 0\n", + " for d1a in data[d-1].text.split(\"\\n\")[5:]:\n", + " d1b = d1a.split(\" \")\n", + " count1 = 0\n", + " for d1c in d1b:\n", + " if d1c != '':\n", + " data1[count][count1] = d1c\n", + " count1 += 1\n", + " count += 1\n", + " flag = [np.nan for z in head]\n", + " while flag in data1:\n", + " data1.remove(flag)\n", + " count = 0\n", + " for z in [data[d].text.split(\"\\n\")[1:-2][x].split(\":\") for x in range(len(data[d].text.split(\"\\n\")[1:-2]))]:\n", + " z1 = z[0].split(\" \")\n", + " data2.append(z[1])\n", + " flag = \"\"\n", + " while flag in z1:\n", + " z1.remove(flag)\n", + " tz = \"\"\n", + " zz = []\n", + " for z2 in range(len(z1)):\n", + " if z2 == 0:\n", + " tz += z1[z2]\n", + " else:\n", + " tz += \"_\"+z1[z2]\n", + " head.append(tz)\n", + " count += 1\n", + " for z in data1:\n", + " for k in data2:\n", + " z.append(k)\n", + " \n", + " mydf1 = pd.DataFrame(data = data1,columns=head)\n", + " topdir = \"\"\n", + " if not os.path.exists(year+\"/\"+month+\"/\"+day):\n", + " os.makedirs(year+\"/\"+month+\"/\"+day)\n", + " topdir = year+\"/\"+month+\"/\"+day+\"/\"\n", + " else:\n", + " topdir = year+\"/\"+month+\"/\"+day+\"/\"\n", + " mydf1.to_csv(topdir+file_1,na_rep=np.nan)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}