Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Version update 0.0.2: Python 3.11 support, testing module, minor bugfixes, added utilities #104

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
2a77981
Update requirements for compatibility with Python 3.11
narlesky Nov 22, 2023
0b5ea9f
Update how user is prompted to install pdftotext
narlesky Nov 23, 2023
5a51fb5
Update setup.py requirements definition
narlesky Nov 27, 2023
8bb965f
Add initial tests for alert, usace.wcds, cnrfc modules to test python…
narlesky Nov 27, 2023
efe5998
Add placeholder tests
narlesky Nov 27, 2023
9f68b18
Start Python 3.10+ pytz to zoneinfo conversion
narlesky Nov 27, 2023
d49a357
NID adjustments for new calendar year data format; additional test su…
narlesky Nov 28, 2023
e3a0707
Update selenium version, which has tools for automatically handling c…
narlesky Nov 28, 2023
90f42c6
Separate tests into files per resource
narlesky Nov 28, 2023
97915db
Update cvo tests; update area target for certain report/date combinat…
narlesky Nov 28, 2023
81bf081
Update test_dwr
narlesky Nov 28, 2023
446694c
Add test for each utility
narlesky Nov 28, 2023
3f992b9
Extend CNRFC tests
narlesky Nov 28, 2023
27183bc
Updates to cnrfc module and add tests
narlesky Nov 29, 2023
f6c4e6e
CNRFC module updates for stability and testing
narlesky Nov 29, 2023
266aa15
Update test docstrings
narlesky Nov 29, 2023
f10420a
Add cross-version support for localizing naive datetimes
narlesky Nov 29, 2023
193ac85
Update imports in test files
narlesky Nov 29, 2023
ef1fec6
Updates for dwr.swp module compatibility with Python 3.11 and linting
narlesky Nov 30, 2023
9fe8ffc
Add B120 initial tests and related changes in dwr.b120
narlesky Nov 30, 2023
9d6ab62
Add DWR CDEC initial tests; minor changes to argument default types f…
narlesky Nov 30, 2023
5b97812
Remove exit
narlesky Nov 2, 2024
2faf676
Update NID, DWR tests; update installation prompt
narlesky Nov 26, 2024
03dac34
Alert, cnrfc tests updates
narlesky Nov 26, 2024
d328cc6
Update installation instructions
narlesky Nov 26, 2024
a48555a
Expand use of utils.get_session_response; update tests; n-dash to m-d…
narlesky Nov 26, 2024
6fb98a2
Update USACE tests for compat with implemented trimming of get_wcds_d…
narlesky Nov 27, 2024
45cb26d
Rollback utils.get_session_response implementation for memory allocat…
narlesky Nov 27, 2024
f69ec9c
Use lxml in place of html5lib where possible
narlesky Nov 27, 2024
ebb74d0
Update setup configurations
narlesky Nov 27, 2024
6672411
Update toml:
narlesky Nov 27, 2024
28cbf6d
Update installation instructions
narlesky Nov 27, 2024
254f052
Assign timezone info to start/end for WCDS query if not provided
narlesky Nov 27, 2024
9696116
Update utc and tz handling in tests
narlesky Nov 27, 2024
e2f898b
Prefer html.parser in place of lxml for BS4
narlesky Nov 27, 2024
de9cd1e
Bugfix for B120 table parsing to exclude non-data rows
narlesky Dec 4, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Web scraping utilities for DWR, USACE, USGS, CNRFC, CVO and SacALERT data repositories.

## Setup instructions
### Create a virtual environment, specifying Python version 3.8
### Create a virtual environment, specifying Python version 3.11

#### With pure Python (3+)
Create a virtual environment with Python 3's built-in `venv` library.
Expand Down
19 changes: 11 additions & 8 deletions collect/alert/alert.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@

from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd
import requests

from collect import utils


Expand All @@ -33,7 +35,7 @@

def _ustrip(x):
"""
strips whitespace represented by unicode non-breaking space in additon to default white
strips whitespace represented by unicode non-breaking space in addition to default white
space stripping by python's str.strip() method
Arguments:
x (str): string containing an encoded whitespace
Expand Down Expand Up @@ -64,8 +66,9 @@ def get_sites(as_dataframe=True, datatype='stream'):
group_type_id = {'rain': 14, 'stream': 19, 'temperature': 30}.get(datatype)

url = f'https://www.sacflood.org/{measure}?&view_id=1&group_type_id={group_type_id}'
soup = BeautifulSoup(utils.get_session_response(url).text, 'lxml')
df = pd.read_html(str(soup.find('table')))[0]
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
with io.StringIO(str(soup.find('table'))) as text:
df = pd.read_html(text)[0]

# strip whitespace from columns
df.columns = [_ustrip(x) for x in df.columns]
Expand All @@ -87,7 +90,7 @@ def get_sites_from_list(as_dataframe=True, sensor_class=None):
url = 'https://www.sacflood.org/list/'
if sensor_class:
url += '?&sensor_class={}'.format(sensor_class)
soup = BeautifulSoup(utils.get_session_response(url).text, 'lxml')
soup = BeautifulSoup(requests.get(url).text, 'html.parser')

entries = []
for x in soup.find_all('a', {'class': None, 'target': None},
Expand All @@ -114,7 +117,7 @@ def get_site_notes(site_id):
"""
url = f'https://www.sacflood.org/site/?site_id={site_id}'
strainer = SoupStrainer('div', {'class': 'card-body'})
soup = BeautifulSoup(utils.get_session_response(url).text, 'lxml', parse_only=strainer)
soup = BeautifulSoup(requests.get(url).text, 'html.parser', parse_only=strainer)
for card in soup.find_all('div', {'class': 'card-body'}):
if 'Notes' in card.find('h3', {'class': 'card-title'}).text:
notes_block = card.find('p', {'class': 'list-group-item-text'})
Expand All @@ -137,7 +140,7 @@ def get_site_location(site_id):
"""
url = f'https://www.sacflood.org/site/?site_id={site_id}'
result = {'site_id': site_id, 'url': url}
soup = BeautifulSoup(utils.get_session_response(url).text, 'lxml')
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
cards = soup.find_all('div', {'class': 'card-body'})
for card in cards:
if 'Map' in card.find('h3', {'class': 'card-title'}).text:
Expand All @@ -156,7 +159,7 @@ def get_site_sensors(site_id):
"""
url = f'https://www.sacflood.org/site/?site_id={site_id}'
result = {'site_id': site_id, 'url': url, 'sensors': []}
soup = BeautifulSoup(utils.get_session_response(url).text, 'lxml')
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
cards = soup.find_all('div', {'class': 'card-body'})
for card in cards:
if 'Sensors' in card.find('h3', {'class': 'card-title'}).text:
Expand Down Expand Up @@ -206,7 +209,7 @@ def get_query_url(site_id, device_id, start, end):

def get_device_series(site_id, device_id, start, end, ascending=True):
url = get_query_url(site_id, device_id, start, end)
response = io.StringIO(utils.get_session_response(url).text)
response = io.StringIO(requests.get(url).text)
df = pd.read_csv(response)
return df.sort_values(by='Reading', ascending=ascending)

Expand Down
Loading