To reduce the number of requests we send to ERDDAP, I've added a caching layer around outgoing HTTP requests for JSON and CSV documents. I will be improving this in the future.
1 parent: efaf741. Commit: a3fe46d. 7 changed files with 296 additions and 33 deletions.
Conda environment file (adds appdirs):

@@ -8,6 +8,7 @@ dependencies:
   - pandas
   - erddapy
   - panel
+  - appdirs
   - intake
   - intake-xarray>=0.6.1
   - pip
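The appdirs dependency is used below to pick a platform-appropriate cache directory. As a quick illustration of the call used in the new module (the output path varies by platform):

import appdirs

# On Linux this typically resolves to ~/.cache/intake-erddap;
# on macOS, ~/Library/Caches/intake-erddap.
print(appdirs.user_cache_dir("intake-erddap", "axds"))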
New caching module (imported in the tests below as intake_erddap.cache):

@@ -0,0 +1,101 @@
"""Caching support.""" | ||
import gzip | ||
import json | ||
import pandas as pd | ||
import time | ||
from typing import Optional, Union, Any, Type | ||
import appdirs | ||
import hashlib | ||
import requests | ||
from pathlib import Path | ||
|
||
|
||
class CacheStore: | ||
"""A caching mechanism to store HTTP responses in a local cache.""" | ||
|
||
def __init__(self, cache_dir: Optional[Path] = None, http_client: Optional[Type] = None, cache_period: Optional[Union[int, float]] = None): | ||
self.cache_dir: Path = cache_dir or Path(appdirs.user_cache_dir("intake-erddap", "axds")) | ||
self.http_client = http_client or requests | ||
self.cache_period = cache_period or 500. | ||
|
||
if not self.cache_dir.exists(): | ||
self.cache_dir.mkdir(parents=True, exist_ok=True) | ||
|
||
@staticmethod | ||
def hash_url(url: str) -> str: | ||
"""Returns the hash of the URL""" | ||
return hashlib.sha256(url.encode("utf-8")).hexdigest() | ||
|
||
def cache_file(self, url: str) -> Path: | ||
"""Return the path to the cache file.""" | ||
checksum = self.hash_url(url) | ||
filename = self.cache_dir / f'{checksum}.gz' | ||
return filename | ||
|
||
def cache_response(self, url: str, *args, **kwargs): | ||
"""Write the content of the HTTP response to a gzipped cached file.""" | ||
filename = self.cache_file(url) | ||
with gzip.open(filename, "wb") as f: | ||
resp = self.http_client.get(url, *args, **kwargs) | ||
resp.raise_for_status() | ||
f.write(resp.content) | ||
|
||
def read_csv(self, url: str, pandas_kwargs: Optional[dict] = None, http_kwargs: Optional[dict] = None) -> pd.DataFrame: | ||
"""Return a pandas data frame read from source or cache.""" | ||
pandas_kwargs = pandas_kwargs or {} | ||
http_kwargs = http_kwargs or {} | ||
pth = self.cache_file(url) | ||
now = time.time() | ||
allowed_mtime = now - self.cache_period | ||
if pth.exists(): | ||
if pth.stat().st_mtime < allowed_mtime: | ||
print("Cache MISS") | ||
self.cache_response(url, **http_kwargs) | ||
else: | ||
print("Cache HIT") | ||
else: | ||
print("Cache MISS") | ||
self.cache_response(url, **http_kwargs) | ||
|
||
with gzip.open(pth) as f: | ||
return pd.read_csv(f, **pandas_kwargs) | ||
|
||
def read_json(self, url: str, http_kwargs: Optional[dict] = None) -> Any: | ||
http_kwargs = http_kwargs or {} | ||
pth = self.cache_file(url) | ||
now = time.time() | ||
allowed_mtime = now - self.cache_period | ||
if pth.exists(): | ||
if pth.stat().st_mtime < allowed_mtime: | ||
print("Cache MISS") | ||
self.cache_response(url, **http_kwargs) | ||
else: | ||
print("Cache HIT") | ||
else: | ||
print("Cache MISS") | ||
self.cache_response(url, **http_kwargs) | ||
|
||
with gzip.open(pth) as f: | ||
return json.load(f) | ||
|
||
def clear_cache(self, mtime: Optional[Union[int, float]] = None): | ||
"""Removes all cached files.""" | ||
if self.cache_dir.exists(): | ||
if mtime is None: | ||
self._clear_cache() | ||
else: | ||
self._clear_cache_mtime(mtime) | ||
|
||
def _clear_cache(self): | ||
"""Removes all cached files.""" | ||
for cache_file in self.cache_dir.glob('*.gz'): | ||
cache_file.unlink() | ||
|
||
def _clear_cache_mtime(self, age: Union[int, float]): | ||
"""Removes cached files older than ``age`` seconds.""" | ||
current_time = time.time() | ||
cutoff = current_time - age | ||
for cache_file in self.cache_dir.glob('*.gz'): | ||
mtime = cache_file.stat().st_mtime | ||
if mtime <= cutoff: | ||
cache_file.unlink() |
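To illustrate how the store is meant to be called, here is a minimal sketch. The module path matches the import used in the tests below; the ERDDAP server URLs are placeholders, not real endpoints:

from intake_erddap.cache import CacheStore

# Responses younger than cache_period (in seconds) are served from the
# local gzipped cache instead of re-querying the server.
store = CacheStore(cache_period=600)

df = store.read_csv("https://erddap.example.org/erddap/search/advanced.csv?searchFor=temperature")
info = store.read_json("https://erddap.example.org/erddap/info/index.json")

# Each URL maps to <cache_dir>/<sha256(url)>.gz on disk.
print(store.cache_file("https://erddap.example.org/erddap/info/index.json"))

# Remove cached responses older than one hour.
store.clear_cache(3600)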
Requirements file (adds appdirs):
@@ -1,3 +1,4 @@
+appdirs
 erddapy
 intake
 intake-xarray
Unit tests for the caching module:
@@ -0,0 +1,140 @@
#!/usr/bin/env pytest
"""Unit tests for caching support."""
import gzip
import os
import shutil
import tempfile
import time

from pathlib import Path
from unittest import mock

import pytest

from intake_erddap import cache


@pytest.fixture
def tempdir():
    tempdir = tempfile.mkdtemp()
    yield tempdir
    if os.path.exists(tempdir):
        shutil.rmtree(tempdir)


@mock.patch("appdirs.user_cache_dir")
def test_cache_file(user_cache_dir_mock, tempdir):
    user_cache_dir_mock.return_value = tempdir
    url = "http://kevinbacon.invalid/erddap/advanced?blahbah"
    store = cache.CacheStore()
    filepath = store.cache_file(url)
    assert filepath.parent == Path(tempdir)
    sha = cache.CacheStore.hash_url(url)
    assert filepath.name == f"{sha}.gz"


@mock.patch("requests.get")
@mock.patch("appdirs.user_cache_dir")
def test_cache_csv(user_cache_dir_mock, http_get_mock, tempdir):
    user_cache_dir_mock.return_value = tempdir
    resp = mock.Mock()
    resp.content = b"blahblah"
    http_get_mock.return_value = resp
    url = "http://kevinbacon.invalid/erddap/advanced?blahbah"
    store = cache.CacheStore()
    store.cache_response(url)
    sha = store.hash_url(url)
    target = Path(tempdir) / f"{sha}.gz"
    assert target.exists()
    http_get_mock.assert_called_with(url)
    with gzip.open(target, "rt", encoding="utf-8") as f:
        buf = f.read()
        assert buf == "blahblah"


@mock.patch("requests.get")
@mock.patch("appdirs.user_cache_dir")
def test_clearing_cache(user_cache_dir_mock, http_get_mock, tempdir):
    user_cache_dir_mock.return_value = tempdir
    resp = mock.Mock()
    resp.content = b"blahblah"
    http_get_mock.return_value = resp
    url = "http://kevinbacon.invalid/erddap/advanced?blahbah"
    store = cache.CacheStore()
    store.cache_response(url)
    sha = store.hash_url(url)
    target = Path(tempdir) / f"{sha}.gz"

    store.clear_cache()
    assert not target.exists()
    store.cache_response(url)
    assert target.exists()

    # Clear cached files older than 100 s. The file we just created is
    # brand new, so it should remain.
    store.clear_cache(100)
    assert target.exists()

    # Now change the mtime of the file to be 500 s old.
    now = time.time()
    os.utime(target, (now - 500, now - 500))
    store.clear_cache(100)
    assert not target.exists()


@mock.patch("appdirs.user_cache_dir")
def test_cache_no_dir(user_cache_dir_mock, tempdir):
    """Tests that the cache store will create the cache dir if it doesn't exist."""
    user_cache_dir_mock.return_value = tempdir
    tempdir = Path(tempdir)
    tempdir.rmdir()
    assert not tempdir.exists()
    cache.CacheStore()
    assert tempdir.exists()


@mock.patch("requests.get")
@mock.patch("appdirs.user_cache_dir")
def test_cache_read_csv(user_cache_dir_mock, http_get_mock, tempdir):
    user_cache_dir_mock.return_value = tempdir
    resp = mock.Mock()
    http_get_mock.return_value = resp
    resp.content = b"col_a,col_b\n1,blue\n2,red\n"
    store = cache.CacheStore()
    url = "http://blah.invalid/erddap/search?q=bacon+egg+and+cheese"
    df = store.read_csv(url)
    assert len(df) == 2
    # Overwrite the cached file; a fresh cache hit should return the new contents.
    filepath = store.cache_file(url)
    with gzip.open(filepath, "wb") as f:
        f.write(b"col_a,col_b\n3,green\n4,yellow\n")
    df = store.read_csv(url)
    assert df["col_a"].tolist() == [3, 4]
    assert df["col_b"].tolist() == ["green", "yellow"]

    # Force a cache miss by backdating the file past the cache period.
    now = time.time()
    os.utime(filepath, (now - 1000, now - 1000))
    df = store.read_csv(url)
    assert df["col_a"].tolist() == [1, 2]
    assert df["col_b"].tolist() == ["blue", "red"]


@mock.patch("requests.get")
@mock.patch("appdirs.user_cache_dir")
def test_cache_read_json(user_cache_dir_mock, http_get_mock, tempdir):
    user_cache_dir_mock.return_value = tempdir
    resp = mock.Mock()
    http_get_mock.return_value = resp
    resp.content = b'{"key":"value", "example": "blah"}'
    store = cache.CacheStore()
    url = "http://blah.invalid/erddap/search?q=bacon+egg+and+cheese"
    data = store.read_json(url)
    assert data == {"key": "value", "example": "blah"}
    filepath = store.cache_file(url)
    with gzip.open(filepath, "wb") as f:
        f.write(b'{"different": "is different"}')
    data = store.read_json(url)
    assert data["different"] == "is different"

    # Force a cache miss by backdating the file past the cache period.
    now = time.time()
    os.utime(filepath, (now - 1000, now - 1000))
    data = store.read_json(url)
    assert data == {"key": "value", "example": "blah"}
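Because the constructor accepts an http_client, the store can also be exercised without mock.patch by injecting a stub client. A minimal sketch under that assumption (StubClient and StubResponse are illustrative names, not part of the module):

import tempfile
from pathlib import Path

from intake_erddap import cache


class StubResponse:
    """Mimics the small part of requests.Response that CacheStore uses."""

    def __init__(self, content: bytes):
        self.content = content

    def raise_for_status(self):
        pass


class StubClient:
    """Stands in for the requests module: only .get() is required."""

    def __init__(self, content: bytes):
        self._content = content

    def get(self, url, *args, **kwargs):
        return StubResponse(self._content)


store = cache.CacheStore(
    cache_dir=Path(tempfile.mkdtemp()),
    http_client=StubClient(b"col_a,col_b\n1,blue\n"),
)
df = store.read_csv("http://example.invalid/erddap/search.csv")
assert df["col_a"].tolist() == [1]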