
Merge pull request #129 from n1analytics/release-0.8.2
Release 0.8.2
hardbyte authored Jul 25, 2018
2 parents aebdbba + d505ee6 commit cffabb3
Showing 9 changed files with 177 additions and 16 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.rst
@@ -1,3 +1,10 @@
+0.8.2
+-----
+
+Fix discrepancies between the Python and C++ versions (#102).
+Utility added to ``anonlink/concurrency.py`` to help with chunking.
+Better GitHub status messages posted by Jenkins.
+
0.8.1
-----

28 changes: 17 additions & 11 deletions Jenkinsfile.groovy
@@ -9,12 +9,12 @@ def isDevelop = env.BRANCH_NAME == 'develop'

VENV_DIRECTORY = "env"

-GIT_CONTEXT = "jenkins"
+GITHUB_TEST_CONTEXT = "jenkins/test"
+GITHUB_RELEASE_CONTEXT = "jenkins/release"

def configs = [
-[label: 'GPU 1', pythons: ['python3.4', 'python3.5', 'python3.6'], compilers: ['clang', 'gcc']],
-//[label: 'osx', pythons: ['python3.5'], compilers: ['clang', 'gcc']]
-[label: 'McNode', pythons: ['python3.5'], compilers: ['clang']]
+[os: 'linux', pythons: ['python3.4', 'python3.5', 'python3.6'], compilers: ['clang', 'gcc']],
+[os: 'osx', pythons: ['python3.6', 'python3.7'], compilers: ['clang']]
]

def PythonVirtualEnvironment prepareVirtualEnvironment(String pythonVersion, clkhashPackageName, compiler, venv_directory = VENV_DIRECTORY) {
@@ -77,7 +77,7 @@ def build(python_version, compiler, label, release = false) {

def builders = [:]
for (config in configs) {
-def label = config["label"]
+def os = config["os"]
def pythons = config["pythons"]
def compilers = config["compilers"]

@@ -86,7 +86,9 @@

def py_version = _py_version
def compiler = _compiler
-def combinedName = "${label}-${py_version}-${compiler}"
+
+def label = "$os&&$py_version&&$compiler"
+def combinedName = "${os} ${compiler} ${py_version}"

builders[combinedName] = {
node(label) {
@@ -102,26 +104,30 @@
GitCommit commit;
node() {
commit = GitUtils.checkoutFromSCM(this);
-commit.setInProgressStatus(GIT_CONTEXT);
+commit.setInProgressStatus(GITHUB_TEST_CONTEXT);
}

try {
parallel builders
+node() {
+    commit.setSuccessStatus(GITHUB_TEST_CONTEXT)
+}
} catch (Exception err) {
node() {
commit.setFailStatus("Build failed", GIT_CONTEXT);
commit.setFailStatus("Build failed", GITHUB_TEST_CONTEXT);
}
throw err
}

node('GPU 1') {
stage('Release') {
try {
+commit.setInProgressStatus(GITHUB_RELEASE_CONTEXT);
build('python3.5', 'gcc', 'GPU 1', true)
-commit.setSuccessStatus(GIT_CONTEXT)
+commit.setSuccessStatus(GITHUB_RELEASE_CONTEXT)
} catch (Exception e) {
commit.setFailStatus("Release failed", GIT_CONTEXT);
commit.setFailStatus("Release failed", GITHUB_RELEASE_CONTEXT);
throw e;
}
}
}
}
16 changes: 13 additions & 3 deletions _cffi_build/dice_one_against_many.cpp
@@ -262,7 +262,7 @@ class Node {

struct score_cmp {
bool operator()(const Node& a, const Node& b) const {
-return a.score >= b.score;
+return a.score > b.score || (a.score == b.score && a.index < b.index);
}
};

@@ -465,8 +465,18 @@ extern "C"
node_queue top_k_scores(score_cmp(), std::move(vec));

uint32_t count_one = _popcount_array(comp1, keywords);
-if (count_one == 0)
-    return 0;
+if (count_one == 0) {
+    if (threshold > 0) {
+        return 0;
+    }
+
+    for (uint32_t j = 0; j < k; ++j) {
+        scores[j] = 0.0;
+        indices[j] = j;
+    }
+
+    return static_cast<int>(k);
+}

uint32_t max_popcnt_delta = keybytes * CHAR_BIT; // = bits per key
if(threshold > 0) {
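A note on the comparator change above: std::priority_queue requires its comparator to define a strict weak ordering, which the old >= comparison does not, and breaking ties on the candidate index makes the top-k selection deterministic, which is presumably what lets the C++ results agree with the Python code path (changelog item #102). As a rough sketch only (illustrative data, not the library's API), the intended ordering corresponds to this in Python:

    # Candidate results as (index, score) pairs -- illustrative values only.
    candidates = [(0, 0.8), (1, 0.9), (2, 0.9), (3, 0.5)]

    # Rank by score descending; equal scores are broken by the smaller index,
    # mirroring the new C++ comparator (score >, then index <).
    top_k = sorted(candidates, key=lambda c: (-c[1], c[0]))[:3]
    # -> [(1, 0.9), (2, 0.9), (0, 0.8)]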
1 change: 1 addition & 0 deletions anonlink/__init__.py
@@ -1,6 +1,7 @@
import pkg_resources

from anonlink import bloommatcher
+from anonlink import concurrency
from anonlink import entitymatch
from anonlink import network_flow

86 changes: 86 additions & 0 deletions anonlink/concurrency.py
@@ -0,0 +1,86 @@
import itertools as _itertools
import math as _math
import numbers as _numbers
import typing as _typing

import mypy_extensions as _mypy_extensions


# Future: There may be better ways of chunking. Hamish suggests putting
# a better guarantee on the maximum size of a chunk. This may help with
# optimisation (e.g., set chunk size to be the size of a page,
# eliminating page faults).
# As the function currently makes no guarantees, any such changes would
# be backwards compatible.


ChunkInfo = _mypy_extensions.TypedDict(
    'ChunkInfo',
    {'datasetIndices': _typing.List[int],
     'ranges': _typing.List[_typing.List[int]]})


def _split_points(size: int, chunks: int) -> _typing.Iterator[int]:
    chunk_size = size / chunks
    for i in range(chunks):
        yield round(i * chunk_size)
    yield size


def _chunks_1d(
    size: int,
    chunks: int
) -> _typing.Iterable[_typing.List[int]]:
    split_points = _split_points(size, chunks)
    a = next(split_points)
    for b in split_points:
        yield [a, b]
        a = b


def split_to_chunks(
    chunk_size_aim: _numbers.Real,
    *,
    # Keyword-only for forwards compatibility: this argument may not be
    # needed once we do blocking
    dataset_sizes: _typing.Sequence[_numbers.Integral]
) -> _typing.Iterable[ChunkInfo]:
    """Split datasets into chunks for parallel processing.

    Resulting chunks are dictionaries with two keys: "datasetIndices"
    and "ranges". The value for "datasetIndices" is a length 2 list of
    the two datasets that we are comparing in this chunk. The value for
    "ranges" is a length 2 list of ranges within those datasets. A range
    is a length 2 list [a, b] representing range(a, b).

    For example, {"datasetIndices": [2, 4], "ranges": [[3, 21], [18, 20]]}
    means that this chunk compares (0-indexed) datasets 2 and 4. We are
    looking at elements 3-20 (inclusive) of dataset 2 and elements 18
    and 19 of dataset 4.

    The chunks are always JSON serialisable.

    :param chunk_size_aim: Number of comparisons per chunk to aim for.
        This is a hint only. No promises.
    :param dataset_sizes: The sizes of the datasets to compare, as a
        sequence.

    :return: An iterable of chunks.
    """
    # int-like and float-like types such as np.int64 are welcome but are
    # not JSON-serialisable.
    chunk_size_aim_float = float(chunk_size_aim)
    dataset_sizes_int = map(int, dataset_sizes)
    for (i0, size0), (i1, size1) in _itertools.combinations(
            enumerate(dataset_sizes_int), 2):
        if not size0 and not size1:
            continue
        chunks0 = round(size0 / _math.sqrt(chunk_size_aim_float)) or 1
        chunk_size0 = size0 / chunks0
        # chunk_size0 is unlikely to be exactly sqrt(chunk_size_aim).
        # Adjust goal chunk size for the second dataset.
        chunks1 = round(size1 * chunk_size0 / chunk_size_aim_float) or 1
        for c0, c1 in _itertools.product(
                _chunks_1d(size0, chunks0), _chunks_1d(size1, chunks1)):
            yield {'datasetIndices': [i0, i1], 'ranges': [c0, c1]}
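To make the chunk format concrete, here is a brief usage sketch. The exact chunk boundaries are an implementation detail (the docstring promises only JSON-serialisable chunks and treats the size as a hint), so the values below are simply what the code above happens to produce:

    from anonlink import concurrency

    # Two datasets of 4 records each, aiming for roughly 4 comparisons per chunk.
    chunks = list(concurrency.split_to_chunks(4, dataset_sizes=[4, 4]))

    # The 4 x 4 comparison space is covered by four 2 x 2 blocks:
    # {'datasetIndices': [0, 1], 'ranges': [[0, 2], [0, 2]]}
    # {'datasetIndices': [0, 1], 'ranges': [[0, 2], [2, 4]]}
    # {'datasetIndices': [0, 1], 'ranges': [[2, 4], [0, 2]]}
    # {'datasetIndices': [0, 1], 'ranges': [[2, 4], [2, 4]]}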
2 changes: 1 addition & 1 deletion anonlink/entitymatch.py
@@ -29,7 +29,7 @@ def dicecoeff(x):

coeffs = filter(lambda c: c[1] >= threshold,
enumerate(map(dicecoeff, filters2)))
-top_k = sorted(coeffs, key=itemgetter(1), reverse=True)[:k]
+top_k = sorted(coeffs, key=lambda x: -x[1])[:k]
result.extend([(i, coeff, j) for j, coeff in top_k])
return result

1 change: 1 addition & 0 deletions requirements.txt
Expand Up @@ -4,3 +4,4 @@ cffi>=1.7
pytest>=3.4
pytest-cov>=2.5
clkhash==0.10.1
+mypy-extensions==0.3.0
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -13,7 +13,7 @@

setup(
name="anonlink",
-version='0.8.1',
+version='0.8.2',
description='Anonymous linkage using cryptographic hashes and bloom filters',
url='https://github.com/n1analytics/anonlink',
license='Apache',
50 changes: 50 additions & 0 deletions tests/test_concurrency.py
@@ -0,0 +1,50 @@
import itertools

import pytest

from anonlink import concurrency

DATASET_SIZES = (0, 1, 100)
DATASET_NUMS = (0, 1, 2, 3)
DATASETS = tuple(itertools.chain.from_iterable(
    itertools.product(DATASET_SIZES, repeat=n) for n in DATASET_NUMS))
CHUNK_SIZE_AIMS = (1, 10, 100)


@pytest.mark.parametrize('datasets', DATASETS)
@pytest.mark.parametrize('chunk_size_aim', CHUNK_SIZE_AIMS)
def test_chunk_size(datasets, chunk_size_aim):
    # Guarantee: chunk_size_aim / 4 < chunk_size < chunk_size_aim * 4.
    # It may be possible to prove a better bound.
    chunks = concurrency.split_to_chunks(chunk_size_aim,
                                         dataset_sizes=datasets)
    for chunk in chunks:
        size = 1
        i0, i1 = chunk['datasetIndices']
        for a, b in chunk['ranges']:
            assert a <= b
            size *= b - a
        assert (chunk_size_aim / 4 < size
                or 4 * chunk_size_aim > datasets[i0] * datasets[i1])
        assert size < chunk_size_aim * 4


@pytest.mark.parametrize('datasets', DATASETS)
@pytest.mark.parametrize('chunk_size_aim', CHUNK_SIZE_AIMS)
def test_comparison_coverage(datasets, chunk_size_aim):
    all_comparisons = set()
    for (i0, s0), (i1, s1) in itertools.combinations(enumerate(datasets), 2):
        for j0, j1 in itertools.product(range(s0), range(s1)):
            all_comparisons.add((i0, i1, j0, j1))
    chunks = concurrency.split_to_chunks(chunk_size_aim,
                                         dataset_sizes=datasets)
    for chunk in chunks:
        i0, i1 = chunk['datasetIndices']
        r0, r1 = chunk['ranges']
        for j0, j1 in itertools.product(range(*r0), range(*r1)):
            # This will raise KeyError if we have duplicates
            all_comparisons.remove((i0, i1, j0, j1))
    # Make sure we've touched everything (so our set is empty)
    assert not all_comparisons

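For a standalone illustration of the bound the first test checks, one concrete case (assuming the 0.8.2 code above; the factor-of-four bound is a property the tests verify, not a documented guarantee):

    from anonlink import concurrency

    # Two datasets of 100 records, aiming for 10 comparisons per chunk.
    for chunk in concurrency.split_to_chunks(10, dataset_sizes=[100, 100]):
        (a0, b0), (a1, b1) = chunk['ranges']
        size = (b0 - a0) * (b1 - a1)
        # Each chunk here covers between 9 and 16 comparisons, comfortably
        # within chunk_size_aim / 4 < size < chunk_size_aim * 4.
        assert 10 / 4 < size < 10 * 4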