
Commit cc12af3

Adding tests and docs
1 parent cc9189e commit cc12af3

6 files changed (+292 -3)

.gitignore

+2-1
```diff
@@ -1,2 +1,3 @@
 build/*
-tailblazer.egg-info/*
+tailblazer.egg-info/*
+.pytest_cache/*
```

README.md

+126
@@ -12,3 +12,129 @@ tbzr.cume_tail_mean(x, tail=0.7)

    # Output
    >>> array([1. , 1.5, 2. , 2.5, 3. , 3.5])

## Included Algorithms

The following sections provide an overview of the mechanics of the
included algorithms and how they balance flexibility and efficiency.

### `pct_rank()`

This algorithm computes cumulative distribution rankings like
`dplyr::cume_dist()` in R. This was a notable gap in the NumPy
ecosystem, and this function fills it efficiently using in-place
modification following an `np.argsort()`. This means that the sort
happens only once and only two new vectors of memory are allocated.
The actual implementation is a bit more complex, but this code
captures the main idea:
```python
# Some array
x = array()

# Get the indices that sort `x`
indices = argsort(x)

# Create container for output
pcts = empty(x.shape)

# Iterate over the indices in reverse
# This means ties are ranked the same
for i in reversed(indices):

    # There is some code here that establishes:
    # - first: are we on the first iteration
    # - current: the current value
    # - previous: the previous value (to handle ties)
    # - last_pct: the percentile from the last iteration
    # - position: the integer position in the rank
    # - n: the total number of values

    if first or current == previous:
        pcts[i] = last_pct
        continue

    pcts[i] = position / n

return pcts
```

This is efficient because we allocate memory for the argsort and the
percentiles, but don't need to sort, unsort, or allocate any other
temporary vectors!
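
To make the sketch concrete, here is a minimal, runnable 1D version.
This is an illustrative re-implementation for the README, not the
package's internal `pct_rank_1d()`, and the function name below is
made up:

```python
import numpy as np
from numpy.typing import NDArray

def cume_dist_sketch(x: NDArray) -> NDArray:
    'Illustrative cumulative-distribution ranking for a 1D array'

    n = len(x)
    indices = np.argsort(x)    # the single sort
    pcts = np.empty(x.shape)   # the single output allocation

    last_pct = 1.0             # the maximum always ranks at 1.0
    previous = None

    # Walk the sorted order from largest to smallest so that
    # tied values inherit the percentile of their highest position
    for position in range(n, 0, -1):
        i = indices[position - 1]
        current = x[i]

        if previous is not None and current == previous:
            pcts[i] = last_pct
        else:
            pcts[i] = position / n
            last_pct = pcts[i]

        previous = current

    return pcts

# e.g. cume_dist_sketch(np.array([1, 2, 3, 4]))  ->  [0.25, 0.5, 0.75, 1.0]
```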

### `cume_tail_mean()`

This function is a highly performant algorithm which computes, for each
value in a vector, the mean of the values that fall in the top tail of
everything at or below that value. In other words, as we iterate through
a vector, if we look at the top 5% of values less than or equal to the
current point, what is their mean? This is a challenging function to
implement because it is tempting to write a for loop which iterates over
the values, filters down the vector, and computes the means:
```python
# Lazy implementation

# Some vector
x = array()

# Create container
means = empty(x.shape)

# Iterate over vector
for i, v in enumerate(x):
    under = x[x <= v]
    means[i] = mean(under[under >= quantile(under, 0.95)])

return means
```

The implementation above is extremely computationally intensive because
a quantile is re-estimated **for every value**. Instead, we can build on
our `pct_rank()` function from above and leverage the power of rescaling
to determine a tail threshold for every value.

Suppose we are at some value in the vector. We can use `pct_rank()` to
tell us its cumulative percentile. Maybe this value is 0.45. We can
approximate the 95th percentile using the current value's percentile!
Multiply `0.45 * 0.95` and voila, the current tail threshold is `0.4275`.
Since we now know the 95th-percentile threshold for every value, we can
iterate through the vector and keep track of how many items are in the
current tail and their sum.
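
As a quick, hypothetical illustration of that rescaling step with NumPy
(the array and the 0.95 tail here are arbitrary):

```python
import numpy as np
import tailblazer as tbzr

x = np.array([3, 1, 4, 1, 5, 9, 2, 6])

# Cumulative percentile of every value, as described above
pcts = tbzr.pct_rank(x)

# Approximate per-value tail thresholds: rescale each percentile by 0.95
thresholds = 0.95 * pcts
```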

The basic idea is to iterate through the vector, add the current value
to the tail, and strip away values which are now excluded. I call this
algorithm the catch-up algorithm because the tail is catching up to the
current value:
```python
# Some array
x = array()

# Get a list of indices and percentiles
indices = argsort(x)
pcts = pct_rank(x)
means = empty(x.shape)

# Establish the lower bound of the tail
# and running totals for the current tail
tail_floor = 0
tail_sum = 0
n_tail = 0

# Walk the values in ascending order
for v in indices:

    # Threshold for this value's tail
    curr_tail = 0.95 * pcts[v]

    # Add the current value to the tail
    tail_sum += x[v]
    n_tail += 1

    # Let the tail catch up: drop values below the new threshold
    while pcts[indices[tail_floor]] < curr_tail:
        tail_sum -= x[indices[tail_floor]]
        n_tail -= 1
        tail_floor += 1

    means[v] = tail_sum / n_tail

return means
```
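
A quick way to play with this is to compare the exported
`cume_tail_mean()` against the naive loop from the lazy implementation
above on a small array. The two may not agree exactly, since the
catch-up approach approximates the tail threshold by rescaling
percentiles rather than recomputing a quantile for every value (the
`naive_tail_mean` helper below is illustrative and not part of the
package):

```python
import numpy as np
import tailblazer as tbzr

def naive_tail_mean(x, tail=0.95):
    'Reference version: re-filter and re-quantile for every value'
    means = np.empty(x.shape)
    for i, v in enumerate(x):
        under = x[x <= v]
        means[i] = under[under >= np.quantile(under, tail)].mean()
    return means

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
print(tbzr.cume_tail_mean(x, tail=0.5))
print(naive_tail_mean(x, tail=0.5))
```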

tailblazer/__init__.py

+1-1
```diff
@@ -1,5 +1,5 @@
 
-from .functions import (
+from .compute import (
     pct_rank,
     cume_tail_mean
 )
```

tailblazer/functions.py → tailblazer/compute.py

+19-1
```diff
@@ -2,6 +2,8 @@
 import numpy as np
 from numpy.typing import NDArray
 
+#### Core Mechanics in 1D ####
+
 def pct_rank_1d(x: NDArray) -> NDArray:
     '''
     Internal implementation of cumulative
@@ -71,6 +73,8 @@ def cume_tail_mean_1d(x: NDArray, tail: float=0.95) -> NDArray:
 
     return mean_vec
 
+#### Internal Helper ####
+
 def apply_multidim(x: NDArray, axis: int, target_fun: callable) -> NDArray:
     '''
     An internal helper function designed to make
@@ -79,11 +83,23 @@ def apply_multidim(x: NDArray, axis: int, target_fun: callable) -> NDArray:
     handling.
     '''
 
+    try:
+        x = np.array(x, dtype=float)
+    except (TypeError, ValueError):
+        raise TypeError('`x` must be an array or coercible to an array')
+
+    if np.isnan(x).any():
+        # NOTE: Consider adding support for nan values later
+        raise ValueError('`x` cannot contain nan values')
+
+    if axis not in (0, 1):
+        raise ValueError('`axis` must be 0 or 1')
+
     n_dim = len(x.shape)
     transpose = n_dim == 2 and axis == 0
 
     if n_dim >= 3:
-        raise ValueError('Only 0, 1, and 2D arrays are supported')
+        raise ValueError('Only 1D and 2D arrays are supported')
 
     flip = lambda x: x.T if transpose else x
 
@@ -92,6 +108,8 @@ def apply_multidim(x: NDArray, axis: int, target_fun: callable) -> NDArray:
 
     return target_fun(x)
 
+#### Exported Algorithms ####
+
 def pct_rank(x: NDArray, axis: int=0) -> NDArray:
     '''
     Computes cumulative distribution over a NumPy
```

tests/test_cume_tail_mean.py

+72
@@ -0,0 +1,72 @@

```python
import tailblazer as tbzr
import numpy as np
import pytest

#### Test Data ####

# Simple arrays
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = np.array([1.0, 1.5, 2.5, 3.0, 4.0, 4.5, 5.5, 6.0])

# Shuffled arrays
shuffle_idx = [0, 7, 1, 2, 6, 3, 5, 4]
x_shuf = np.array([x[i] for i in shuffle_idx])
y_shuf = np.array([y[i] for i in shuffle_idx])

# 2D arrays
x_2d = np.array([[1, 2, 3, 4], [3, 4, 5, 6]])
y_2d_a0 = np.array([[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0]])
y_2d_a1 = np.array([[1.0, 1.5, 2.5, 3.0], [3.0, 3.5, 4.5, 5.0]])

# Special objects
x_nan = np.array([i if i != 1 else np.nan for i in x])
empty_arr = np.array([])
weird_data = {'weird': 'data'}
three_dim_arr = [[[1, 2]]]

#### Assertion Tests ####

def test_basic_tail_mean_usage():
    'The most simplistic case on a 1D array'

    assert all(tbzr.cume_tail_mean(x, tail=0.5) == y)

def test_unordered_input():
    'The same simple case but shuffled'

    assert all(tbzr.cume_tail_mean(x_shuf, tail=0.5) == y_shuf)

def test_empty_array():
    'Allows for empty arrays to simply pass through'

    assert tbzr.cume_tail_mean(empty_arr).shape == (0, )

def test_missing_values():
    'Ensures nan values are halted'

    with pytest.raises(ValueError):
        tbzr.cume_tail_mean(x_nan)

def test_weird_objects():
    'Ensures we must have a valid numeric array'

    with pytest.raises(TypeError):
        tbzr.cume_tail_mean(weird_data)

def test_2d_object():
    'Tests operating on different axes'

    assert np.all(tbzr.cume_tail_mean(x_2d, 0, 0.5) == y_2d_a0)
    assert np.all(tbzr.cume_tail_mean(x_2d, 1, 0.5) == y_2d_a1)

def test_3d_objects():
    'Tests halting on 3D+ arrays'

    with pytest.raises(ValueError):
        tbzr.cume_tail_mean(three_dim_arr)

def test_weird_axis():
    'Tests halting on invalid axis'

    with pytest.raises(ValueError):
        tbzr.cume_tail_mean(x, axis=0.5)
```

tests/test_pct_rank.py

+72
@@ -0,0 +1,72 @@

```python
import tailblazer as tbzr
import numpy as np
import pytest

#### Test Data ####

# Simple arrays
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = np.array([0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0])

# Shuffled arrays
shuffle_idx = [0, 7, 1, 2, 6, 3, 5, 4]
x_shuf = np.array([x[i] for i in shuffle_idx])
y_shuf = np.array([y[i] for i in shuffle_idx])

# 2D arrays
x_2d = np.array([[1, 2], [3, 4]])
y_2d_a0 = np.array([[0.5, 0.5], [1.0, 1.0]])
y_2d_a1 = np.array([[0.5, 1.0], [0.5, 1.0]])

# Special objects
x_nan = np.array([i if i != 1 else np.nan for i in x])
empty_arr = np.array([])
weird_data = {'weird': 'data'}
three_dim_arr = [[[1, 2]]]

#### Assertion Tests ####

def test_basic_pct_rank_usage():
    'The most simplistic case on a 1D array'

    assert all(tbzr.pct_rank(x) == y)

def test_unordered_input():
    'The same simple case but shuffled'

    assert all(tbzr.pct_rank(x_shuf) == y_shuf)

def test_empty_array():
    'Allows for empty arrays to simply pass through'

    assert tbzr.pct_rank(empty_arr).shape == (0, )

def test_missing_values():
    'Ensures nan values are halted'

    with pytest.raises(ValueError):
        tbzr.pct_rank(x_nan)

def test_weird_objects():
    'Ensures we must have a valid numeric array'

    with pytest.raises(TypeError):
        tbzr.pct_rank(weird_data)

def test_2d_object():
    'Tests operating on different axes'

    assert np.all(tbzr.pct_rank(x_2d, 0) == y_2d_a0)
    assert np.all(tbzr.pct_rank(x_2d, 1) == y_2d_a1)

def test_3d_objects():
    'Tests halting on 3D+ arrays'

    with pytest.raises(ValueError):
        tbzr.pct_rank(three_dim_arr)

def test_weird_axis():
    'Tests halting on invalid axis'

    with pytest.raises(ValueError):
        tbzr.pct_rank(x, axis=0.5)
```
