Skip to content

Commit

Permalink
Release v0.8.1
Browse files Browse the repository at this point in the history
  • Loading branch information
hardbyte authored May 18, 2018
2 parents 23a7be7 + f10b6b3 commit aebdbba
Show file tree
Hide file tree
Showing 8 changed files with 31 additions and 23 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
0.8.1
-----

Minor updates and fixes. Code cleanup.

- Remove the minimum chunk-size check so that small (non-empty) chunks no longer raise an error.

0.8.0
-----

Expand Down
5 changes: 3 additions & 2 deletions anonlink/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def compute_comparison_speed(n1, n2, threshold):
start = timer()
sparse_matrix = calculate_filter_similarity(filters1, filters2, len(filters2), threshold)
t1 = timer()
res = greedy_solver(sparse_matrix)
_ = greedy_solver(sparse_matrix)
end = timer()

similarity_time = t1 - start
Expand Down Expand Up @@ -176,6 +176,7 @@ def benchmark(size, compare):
if test_size <= size:
compute_comparison_speed(test_size, test_size, thld)


if __name__ == '__main__':
benchmark(4000, False)
#benchmark(20000, False)
# benchmark(20000, False)
13 changes: 6 additions & 7 deletions anonlink/bloommatcher.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
from hashlib import sha1, md5
import hmac
from bitarray import bitarray
from anonlink._entitymatcher import ffi, lib

__author__ = 'Stephen Hardy, Brian Thorne'
Expand All @@ -12,7 +9,8 @@ def dicecoeff_pure_python(e1, e2):
Implemented exclusively in Python.
:param e1, e2: bitarrays of same length
:param e1: bitarray of same length as e2
:param e2: bitarray of same length as e1
:return: real 0-1 similarity measure
"""
count1 = e1.count()
Expand All @@ -24,19 +22,22 @@ def dicecoeff_pure_python(e1, e2):
else:
return 2.0 * overlap_count / combined_count


def dicecoeff_native(e1, e2):
"""
Dice coefficient measures the similarity of two bit patterns.
Implemented via an external library.
:param e1, e2: bitarrays of same length
:param e1: bitarray of same length as e2
:param e2: bitarray of same length as e1
:return: real 0-1 similarity measure
"""
e1array = ffi.new("char[]", e1.tobytes())
e2array = ffi.new("char[]", e2.tobytes())
return lib.dice_coeff(e1array, e2array, len(e1array))


def dicecoeff(e1, e2):
"""
Dice coefficient measures the similarity of two bit patterns
Expand Down Expand Up @@ -85,5 +86,3 @@ def tanimoto_precount(e1, e2, count):
"""
a = (e1 & e2).count()
return a / float(count - a)


3 changes: 2 additions & 1 deletion anonlink/distributed_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def calculate_filter_similarity(filters1, filters2, k, threshold):
:param filters1:
:param filters2:
:param k:
:param threshold:
:return:
"""

Expand All @@ -45,4 +47,3 @@ def calculate_filter_similarity(filters1, filters2, k, threshold):
results.extend(future.result())

return results

18 changes: 8 additions & 10 deletions anonlink/entitymatch.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from itertools import repeat
import logging

from anonlink._entitymatcher import ffi, lib
Expand All @@ -6,7 +7,6 @@
from operator import itemgetter

from . import bloommatcher as bm
from . import util

log = logging.getLogger('anonlink.entitymatch')

Expand Down Expand Up @@ -97,11 +97,11 @@ def cffi_filter_similarity_k(filters1, filters2, k, threshold):
c_scores)

if matches < 0:
raise ValueError('Internel error: Bad key length')
for j in range(matches):
ind = c_indices[j]
assert ind < len(filters2)
result.append((i, c_scores[j], ind))
raise ValueError('Internal error: Bad key length')

# Take the first `matches` elements of c_scores and c_indices.
# Store them along with `i`.
result.extend(zip(repeat(i, matches), c_scores, c_indices))

return result

Expand Down Expand Up @@ -169,12 +169,10 @@ def calculate_filter_similarity(filters1, filters2, k, threshold, use_python=Fal
- the similarity score between 0 and 1 of the best match
- The index in filters2 of the best match
"""
MIN_LENGTH = 5
if len(filters1) < MIN_LENGTH or len(filters2) < MIN_LENGTH:
raise ValueError("Didn't meet minimum number of entities")
if not filters1 or not filters2:
raise ValueError('empty input')
# use C++ version by default
if use_python:
return python_filter_similarity(filters1, filters2, k, threshold)
else:
return cffi_filter_similarity_k(filters1, filters2, k, threshold)

6 changes: 4 additions & 2 deletions anonlink/network_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,14 +128,16 @@ def map_entities(weights, threshold, method=None):


if __name__ == "__main__":
import numpy as np
A = [[4.0, 3.0, 2.0, 1.0],
[1.0, 4.0, 3.0, 2.0],
[2.0, 1.0, 4.0, 3.0],
[2.5, 3.5, 4.5, 1.5]]

print("Threshold | Match | Entity Mapping")
for threshold in np.linspace(2.5, 3.5, 11):
n_thresholds = 10
for t in range(n_thresholds + 1):
# threshold will range from 2.5 to 3.5 inclusive in steps of 0.1
threshold = 2.5 + 1.0 * t / n_thresholds
entity_map = map_entities(A, threshold)
perfect_match = len(entity_map) == len(A)
print("{:9.3f} | {:5} | {:26s} ".format(threshold, perfect_match, entity_map))
1 change: 1 addition & 0 deletions anonlink/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from anonlink._entitymatcher import ffi, lib


def generate_bitarray(length):
a = bitarray(endian=['little', 'big'][random.randint(0, 1)])
a.frombytes(os.urandom(length//8))
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

setup(
name="anonlink",
version='0.8.0',
version='0.8.1',
description='Anonymous linkage using cryptographic hashes and bloom filters',
url='https://github.com/n1analytics/anonlink',
license='Apache',
Expand Down

0 comments on commit aebdbba

Please sign in to comment.