Skip to content

Commit

Permalink
add ftp fallback
Browse files Browse the repository at this point in the history
  • Loading branch information
nebfield committed Jan 16, 2024
1 parent 6033a76 commit 38f1ee0
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/corelib-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ on:
- 'pgscatalog.corelib/**.py'

jobs:
downloadapp-pytest:
downloadapp-corelib:
uses: ./.github/workflows/pytest.yaml
with:
package-directory: "pgscatalog.corelib"
3 changes: 3 additions & 0 deletions pgscatalog.corelib/src/pgscatalog/corelib/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@

# HTTP header dict carrying this package's identifying string as the user-agent
# (presumably sent with PGS Catalog API requests -- confirm against callers)
API_HEADER = {"user-agent": _package_string}
# Four directory levels above the directory containing this file; in a source
# checkout (pgscatalog.corelib/src/pgscatalog/corelib/config.py) that is the
# pgscatalog.corelib package directory
ROOT_DIR = pathlib.Path(__file__).resolve().parent.parent.parent.parent
# Maximum number of attempts the tenacity retry decorators in scorefiles.py make
# before giving up (used for both the HTTPS download and the FTP fallback)
MAX_ATTEMPTS = 5
# When True, ScoringFile.download() raises immediately instead of fetching over
# HTTPS, so scoring files are only ever retrieved via the FTP fallback
FTP_EXCLUSIVE = False
46 changes: 45 additions & 1 deletion pgscatalog.corelib/src/pgscatalog/corelib/scorefiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import pathlib
import tempfile
import urllib

import httpx
import tenacity
Expand All @@ -16,6 +17,42 @@
from pgscatalog.corelib.catalogapi import CatalogQuery, GenomeBuild, ScoreQueryResult


@retry(
    stop=tenacity.stop_after_attempt(config.MAX_ATTEMPTS),
    retry=tenacity.retry_if_exception_type(IOError),
    wait=tenacity.wait_fixed(3) + tenacity.wait_random(0, 2),
)
def _ftp_fallback(retry_state):
    """Download a scoring file over FTP after the HTTPS download has given up.

    This is the ``retry_error_callback`` of ``ScoringFile.download()``: tenacity
    calls it with the retry state of the failed call. The scoring file and its
    ``.md5`` companion are fetched from the same location with the ``https://``
    scheme swapped for ``ftp://``. The file is only moved to its final name in
    the target directory once its MD5 digest matches the published one.

    The function is itself retried (up to ``config.MAX_ATTEMPTS`` attempts,
    waiting 3 seconds plus up to 2 random seconds between them) whenever an
    ``IOError`` / ``OSError`` is raised, which covers both network failures
    reported by ``urllib`` and checksum mismatches.

    Args:
        retry_state: the tenacity retry state of the failed ``download`` call.
            ``args[0]`` is the scoring file object (its ``path`` attribute holds
            the HTTPS URL). The target directory is ``args[1]`` or, when it was
            passed by keyword, ``kwargs["directory"]``.

    Raises:
        IOError: when the download fails, the checksum file is empty, or the
            local and remote checksums differ (each triggers a retry; once
            attempts run out tenacity raises its own ``RetryError`` unless
            configured otherwise).
    """
    # a bare `import urllib` does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used
    import urllib.request

    scorefile = retry_state.args[0]
    # download(directory=...) puts the directory in kwargs rather than args
    if len(retry_state.args) > 1:
        directory = retry_state.args[1]
    else:
        directory = retry_state.kwargs["directory"]

    ftp_url = scorefile.path.replace("https://", "ftp://")
    checksum_url = (scorefile.path + ".md5").replace("https://", "ftp://")

    fn = pathlib.Path(scorefile.path).name
    out_path = pathlib.Path(directory) / fn

    # Reserve a unique name in the target directory, then close the handle:
    # re-opening a NamedTemporaryFile by name while it is still open is not
    # portable, and urlretrieve opens the destination by name.
    with tempfile.NamedTemporaryFile(dir=directory, delete=False) as score_f:
        tmp_path = pathlib.Path(score_f.name)

    try:
        urllib.request.urlretrieve(ftp_url, str(tmp_path))

        # the checksum file is tiny, so read it directly instead of via disk
        with urllib.request.urlopen(checksum_url) as response:
            checksum_fields = response.read().decode().split()
        if not checksum_fields:
            raise IOError(f"Empty checksum file at {checksum_url}")
        remote = checksum_fields[0]

        # hash in chunks so large scoring files are never held in memory whole
        md5 = hashlib.md5()
        with open(tmp_path, "rb") as downloaded:
            for chunk in iter(lambda: downloaded.read(1 << 20), b""):
                md5.update(chunk)

        if (checksum := md5.hexdigest()) != remote:
            raise IOError(f"Local checksum {checksum} doesn't match remote {remote}")

        os.rename(tmp_path, out_path)
    finally:
        # After a successful rename the temporary name no longer exists and this
        # is a no-op; on any failure it stops partial downloads piling up in the
        # target directory across retries.
        tmp_path.unlink(missing_ok=True)


class ScoringFileHeader:
"""Headers are a way of storing useful metadata about the scoring file. This
header expects a PGS Catalog header format.
Expand Down Expand Up @@ -208,8 +245,10 @@ def _init_from_accession(self, accession, target_build):
self.path = score.get_download_url(target_build)

@retry(
stop=tenacity.stop_after_attempt(5),
stop=tenacity.stop_after_attempt(config.MAX_ATTEMPTS),
retry=tenacity.retry_if_exception_type(httpx.RequestError),
retry_error_callback=_ftp_fallback,
wait=tenacity.wait_fixed(3) + tenacity.wait_random(0, 2),
)
def download(self, directory, overwrite=False):
"""
Expand All @@ -228,6 +267,11 @@ def download(self, directory, overwrite=False):
... print(os.listdir(tmp_dir))
['PGS000001_hmPOS_GRCh38.txt.gz']
"""
if config.FTP_EXCLUSIVE:
# replace wait function to hit callback quickly
self.download.retry.wait = tenacity.wait_none()
raise httpx.RequestError("HTTPS downloads disabled by config.FTP_EXCLUSIVE")

try:
fn = pathlib.Path(self.path).name
out_path = pathlib.Path(directory) / fn
Expand Down

0 comments on commit 38f1ee0

Please sign in to comment.