Skip to content

Commit 2582bf6

Browse files
committed
Added --head_precheck option to test/downloads.py
Since an HTTP HEAD request is inexpensive, this option allows more frequent probing of whether files are still available. If the HEAD request fails, a full GET request is issued to verify the contents. Note that a successful HEAD request does not necessarily mean that the downloadable content is unchanged, so a periodic full download is still needed. However, most of the failures seen so far were caused by moved files or inaccessible servers, both of which a HEAD request should detect. The precheck can therefore be run more frequently, so that possible problems are identified sooner.
1 parent 90ffe64 commit 2582bf6

File tree

1 file changed

+20
-1
lines changed

1 file changed

+20
-1
lines changed

test/downloads.py

+20-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import requests
12
import gzip
23
import io
34
import random
@@ -14,6 +15,9 @@
1415
import ir_datasets
1516

1617

18+
logger = ir_datasets.log.easy()
19+
20+
1721
@contextmanager
1822
def tmp_environ(**kwargs):
1923
orig_values = {}
@@ -36,6 +40,7 @@ class TestDownloads(unittest.TestCase):
3640
output_path = None
3741
rand_delay = None # useful for being nice to servers when running tests by adding a random delay between tests
3842
output_data = []
43+
head_precheck = False
3944

4045
def test_downloads(self):
4146
with open('ir_datasets/etc/downloads.json') as f:
@@ -52,7 +57,9 @@ def test_downloads(self):
5257
for top_key in clir_dlc.keys():
5358
sub_keys = list(clir_dlc[top_key].keys())
5459
for sub_key in random.sample(sub_keys, 10):
55-
self.output_data.append(self._test_download(clir_dlc[top_key][sub_key], f'clirmatrix/{top_key}/{sub_key}'))
60+
res = self._test_download(clir_dlc[top_key][sub_key], f'clirmatrix/{top_key}/{sub_key}')
61+
if res['status'] != 'HEAD_SKIP':
62+
self.output_data.append(res)
5663
finally:
5764
if self.output_path is not None:
5865
with open(self.output_path, 'wt') as f:
@@ -85,6 +92,14 @@ def _test_download(self, data, download_id):
8592
# sleep in range of [0.5, 1.5] * rand_delay seconds
8693
time.sleep(random.uniform(self.rand_delay * 0.5, self.rand_delay * 1.5))
8794
start = time.time()
95+
if self.head_precheck:
96+
try:
97+
requests.head(data['url'], allow_redirects=True).raise_for_status()
98+
logger.info('HEAD request for {url} successful'.format(**data))
99+
record['result'] = 'HEAD_SKIP'
100+
return record # skip if HEAD request passes
101+
except:
102+
logger.info('HEAD request for {url} failed; verifying download'.format(**data))
88103
try:
89104
download = ir_datasets.util.Download([ir_datasets.util.RequestsDownload(data['url'], **data.get('download_args', {}))], expected_md5=data['expected_md5'], stream=True)
90105
with download.stream() as stream:
@@ -120,4 +135,8 @@ def _test_download(self, data, download_id):
120135
if arg == '--randdelay':
121136
TestDownloads.rand_delay = float(argv[i+1])
122137
argv = argv[:i] + argv[i+2:]
138+
for i, arg in enumerate(argv):
139+
if arg == '--head_precheck':
140+
TestDownloads.head_precheck = True
141+
argv = argv[:i] + argv[i+1:]
123142
unittest.main(argv=argv)

0 commit comments

Comments
 (0)