Merge pull request #129 from n1analytics/release-0.8.2
Release 0.8.2
Showing 9 changed files with 177 additions and 16 deletions.
anonlink/concurrency.py
@@ -0,0 +1,86 @@
import itertools as _itertools
import math as _math
import numbers as _numbers
import typing as _typing

import mypy_extensions as _mypy_extensions


# Future: There may be better ways of chunking. Hamish suggests putting
# a better guarantee on the maximum size of a chunk. This may help with
# optimisation (e.g., set chunk size to be the size of a page,
# eliminating page faults).
# As the function currently makes no guarantees, any such changes would
# be backwards compatible.


ChunkInfo = _mypy_extensions.TypedDict(
    'ChunkInfo',
    {'datasetIndices': _typing.List[int],
     'ranges': _typing.List[_typing.List[int]]})


def _split_points(size: int, chunks: int) -> _typing.Iterator[int]:
    chunk_size = size / chunks
    for i in range(chunks):
        yield round(i * chunk_size)
    yield size


def _chunks_1d(
    size: int,
    chunks: int
) -> _typing.Iterable[_typing.List[int]]:
    split_points = _split_points(size, chunks)
    a = next(split_points)
    for b in split_points:
        yield [a, b]
        a = b


def split_to_chunks(
    chunk_size_aim: _numbers.Real,
    *,
    # Keyword-only for forwards compatibility: this argument may not be
    # needed once we do blocking
    dataset_sizes: _typing.Sequence[_numbers.Integral]
) -> _typing.Iterable[ChunkInfo]:
    """Split datasets into chunks for parallel processing.

    Resulting chunks are dictionaries with two keys: "datasetIndices"
    and "ranges". The value for "datasetIndices" is a length 2 list of
    the two datasets that we are comparing in this chunk. The value
    for "ranges" is a length 2 list of ranges within those datasets.
    A range is a length 2 list [a, b] representing range(a, b).

    For example, {"datasetIndices": [2, 4], "ranges": [[3, 21], [18, 20]]}
    means that this chunk compares (0-indexed) datasets 2 and 4. We are
    looking at elements 3-20 (inclusive) of dataset 2 and elements 18
    and 19 of dataset 4.

    The chunks are always JSON serialisable.

    :param chunk_size_aim: Number of comparisons per chunk to aim for.
        This is a hint only. No promises.
    :param dataset_sizes: The sizes of the datasets to compare, as a
        sequence.

    :return: An iterable of chunks.
    """
    # int-like and float-like types such as np.int64 are welcome but
    # are not JSON-serialisable.
    chunk_size_aim_float = float(chunk_size_aim)
    dataset_sizes_int = map(int, dataset_sizes)
    for (i0, size0), (i1, size1) in _itertools.combinations(
            enumerate(dataset_sizes_int), 2):
        if not size0 and not size1:
            continue
        chunks0 = round(size0 / _math.sqrt(chunk_size_aim_float)) or 1
        chunk_size0 = size0 / chunks0
        # chunk_size0 is unlikely to be exactly sqrt(chunk_size_aim).
        # Adjust goal chunk size for the second dataset.
        chunks1 = round(size1 * chunk_size0 / chunk_size_aim_float) or 1
        for c0, c1 in _itertools.product(
                _chunks_1d(size0, chunks0), _chunks_1d(size1, chunks1)):
            yield {'datasetIndices': [i0, i1], 'ranges': [c0, c1]}
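
For orientation, a minimal usage sketch (not part of the commit; the dataset sizes and the chunk_size_aim of 100 are arbitrary illustration values):

import json

from anonlink import concurrency

# Aim for roughly 100 comparisons per chunk over two datasets of 30
# and 25 records (a single dataset pair, so 30 * 25 comparisons).
chunks = list(concurrency.split_to_chunks(100, dataset_sizes=[30, 25]))
print(chunks[0])
# {'datasetIndices': [0, 1], 'ranges': [[0, 10], [0, 12]]}

# Chunks contain only dicts, lists and ints, so they survive a JSON
# round trip intact.
assert json.loads(json.dumps(chunks)) == chunks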
requirements.txt
@@ -4,3 +4,4 @@ cffi>=1.7
pytest>=3.4
pytest-cov>=2.5
clkhash==0.10.1
mypy-extensions==0.3.0
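
The new mypy-extensions pin provides the TypedDict used to declare ChunkInfo above. A small sketch of what it buys (assumed usage; the values are taken from the docstring example):

from anonlink.concurrency import ChunkInfo

# ChunkInfo is an ordinary dict at runtime; the TypedDict declaration
# only tells static checkers such as mypy which keys and value types
# to expect.
chunk: ChunkInfo = {'datasetIndices': [2, 4], 'ranges': [[3, 21], [18, 20]]}
assert isinstance(chunk, dict)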
tests/test_concurrency.py
@@ -0,0 +1,50 @@
import itertools

import pytest

from anonlink import concurrency

DATASET_SIZES = (0, 1, 100)
DATASET_NUMS = (0, 1, 2, 3)
DATASETS = tuple(itertools.chain.from_iterable(
    itertools.product(DATASET_SIZES, repeat=n) for n in DATASET_NUMS))
CHUNK_SIZE_AIMS = (1, 10, 100)


@pytest.mark.parametrize('datasets', DATASETS)
@pytest.mark.parametrize('chunk_size_aim', CHUNK_SIZE_AIMS)
def test_chunk_size(datasets, chunk_size_aim):
    # Guarantee: chunk_size_aim / 4 < chunk_size < chunk_size_aim * 4.
    # It may be possible to prove a better bound.
    chunks = concurrency.split_to_chunks(chunk_size_aim,
                                         dataset_sizes=datasets)
    for chunk in chunks:
        size = 1
        i0, i1 = chunk['datasetIndices']
        for a, b in chunk['ranges']:
            assert a <= b
            size *= b - a
        assert (chunk_size_aim / 4 < size
                or 4 * chunk_size_aim > datasets[i0] * datasets[i1])
        assert size < chunk_size_aim * 4


@pytest.mark.parametrize('datasets', DATASETS)
@pytest.mark.parametrize('chunk_size_aim', CHUNK_SIZE_AIMS)
def test_comparison_coverage(datasets, chunk_size_aim):
    all_comparisons = set()
    for (i0, s0), (i1, s1) in itertools.combinations(enumerate(datasets), 2):
        for j0, j1 in itertools.product(range(s0), range(s1)):
            all_comparisons.add((i0, i1, j0, j1))
    chunks = concurrency.split_to_chunks(chunk_size_aim,
                                         dataset_sizes=datasets)
    for chunk in chunks:
        i0, i1 = chunk['datasetIndices']
        r0, r1 = chunk['ranges']
        for j0, j1 in itertools.product(range(*r0), range(*r1)):
            # This will raise KeyError if we have duplicates
            all_comparisons.remove((i0, i1, j0, j1))
    # Make sure we've touched everything (so our set is empty)
    assert not all_comparisons
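
The coverage property these tests check is what makes the chunks safe to fan out to workers. One way they might be consumed (a hypothetical pattern, not part of this commit; compare_range is a stand-in for the caller's real comparison worker):

from concurrent.futures import ProcessPoolExecutor

from anonlink import concurrency


def compare_range(chunk):
    # Stand-in worker: just count the comparisons this chunk covers.
    (a0, b0), (a1, b1) = chunk['ranges']
    return (b0 - a0) * (b1 - a1)


if __name__ == '__main__':
    chunks = concurrency.split_to_chunks(1000,
                                         dataset_sizes=[500, 700, 300])
    with ProcessPoolExecutor() as executor:
        counts = list(executor.map(compare_range, chunks))
    # Every cross-dataset pair is covered exactly once (as
    # test_comparison_coverage verifies), so the per-chunk counts sum
    # to the total number of comparisons.
    assert sum(counts) == 500 * 700 + 500 * 300 + 700 * 300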