
Commit cc12af3

Adding tests and docs
1 parent cc9189e commit cc12af3

6 files changed (+292 -3)

.gitignore

+2-1
```diff
@@ -1,2 +1,3 @@
 build/*
-tailblazer.egg-info/*
+tailblazer.egg-info/*
+.pytest_cache/*
```

README.md

+126
@@ -12,3 +12,129 @@ tbzr.cume_tail_mean(x, tail=0.7)

    # Output
    >>> array([1. , 1.5, 2. , 2.5, 3. , 3.5])

## Included Algorithms

The following sections provide an overview of the mechanics of the
included algorithms and how they balance flexibility and efficiency.

### `pct_rank()`

This algorithm computes cumulative distribution rankings like
`dplyr::cume_dist()` in R. This was a notable gap in the NumPy
ecosystem, and this function fills it efficiently using in-place
modification following an `np.argsort()`. This means that the sort
happens only once and only two new vectors of memory are allocated.
The actual implementation is a bit more complex, but this code
captures the main idea:
```python
# Some array
x = array()

# Get the indices that sort `x`
indices = argsort(x)

# Create container for output
pcts = empty(x.shape)

# Iterate over the indices in reverse
# This means ties are ranked the same
for i in reversed(indices):

    # There is some code here that establishes:
    # - first: are we on the first iteration
    # - current: the current value
    # - previous: the previous value (to handle ties)
    # - last_pct: the percentile from the last iteration
    # - position: the integer position in the rank
    # - n: the total number of values

    if first or current == previous:
        pcts[i] = last_pct
        continue

    pcts[i] = position / n

return pcts
```

This is efficient because we allocate memory for the argsort and the
percentiles, but don't need to sort, unsort, or allocate any other
temporary vectors!
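
To make the sketch concrete, here is a minimal, runnable 1D version.
This is an illustrative re-implementation for the README, not the
package's internal `pct_rank_1d()`, and the function name below is
made up:

```python
import numpy as np
from numpy.typing import NDArray

def cume_dist_sketch(x: NDArray) -> NDArray:
    'Illustrative cumulative-distribution ranking for a 1D array'

    n = len(x)
    indices = np.argsort(x)    # the single sort
    pcts = np.empty(x.shape)   # the single output allocation

    last_pct = 1.0             # the maximum always ranks at 1.0
    previous = None

    # Walk the sorted order from largest to smallest so that
    # tied values inherit the percentile of their highest position
    for position in range(n, 0, -1):
        i = indices[position - 1]
        current = x[i]

        if previous is not None and current == previous:
            pcts[i] = last_pct
        else:
            pcts[i] = position / n
            last_pct = pcts[i]

        previous = current

    return pcts

# e.g. cume_dist_sketch(np.array([1, 2, 3, 4]))  ->  [0.25, 0.5, 0.75, 1.0]
```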

### `cume_tail_mean()`

This function is a highly performant algorithm which computes, for each
value in a vector, the mean of the values that fall in the top tail of
everything at or below that value. In other words, as we iterate through
a vector, if we look at the top 5% of values less than or equal to the
current point, what is their mean? This is a challenging function to
implement because it is tempting to write a for loop which iterates over
the values, filters down the vector, and computes the means:
```python
# Lazy implementation

# Some vector
x = array()

# Create container
means = empty(x.shape)

# Iterate over vector
for i, v in enumerate(x):
    under = x[x <= v]
    means[i] = mean(under[under >= quantile(under, 0.95)])

return means
```

The implementation above is extremely computationally intensive because
a quantile is re-estimated **for every value**. Instead, we can build on
our `pct_rank()` function from above and leverage the power of rescaling
to determine a tail threshold for every value.

Suppose we are at some value in the vector. We can use `pct_rank()` to
tell us its cumulative percentile. Maybe this value is 0.45. We can
approximate the 95th percentile using the current value's percentile!
Multiply `0.45 * 0.95` and voila, the current tail threshold is `0.4275`.
Since we now know the 95th-percentile threshold for every value, we can
iterate through the vector and keep track of how many items are in the
current tail and their sum.
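
As a quick, hypothetical illustration of that rescaling step with NumPy
(the array and the 0.95 tail here are arbitrary):

```python
import numpy as np
import tailblazer as tbzr

x = np.array([3, 1, 4, 1, 5, 9, 2, 6])

# Cumulative percentile of every value, as described above
pcts = tbzr.pct_rank(x)

# Approximate per-value tail thresholds: rescale each percentile by 0.95
thresholds = 0.95 * pcts
```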

The basic idea is to iterate through the vector, add the current value
to the tail, and strip away values which are now excluded. I call this
algorithm the catch-up algorithm because the tail is catching up to the
current value:
```python
# Some array
x = array()

# Get a list of indices and percentiles
indices = argsort(x)
pcts = pct_rank(x)
means = empty(x.shape)

# Establish the lower bound of the tail
# and running totals for the current tail
tail_floor = 0
tail_sum = 0
n_tail = 0

# Walk the values in ascending order
for v in indices:

    # Threshold for this value's tail
    curr_tail = 0.95 * pcts[v]

    # Add the current value to the tail
    tail_sum += x[v]
    n_tail += 1

    # Let the tail catch up: drop values below the new threshold
    while pcts[indices[tail_floor]] < curr_tail:
        tail_sum -= x[indices[tail_floor]]
        n_tail -= 1
        tail_floor += 1

    means[v] = tail_sum / n_tail

return means
```
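
A quick way to play with this is to compare the exported
`cume_tail_mean()` against the naive loop from the lazy implementation
above on a small array. The two may not agree exactly, since the
catch-up approach approximates the tail threshold by rescaling
percentiles rather than recomputing a quantile for every value (the
`naive_tail_mean` helper below is illustrative and not part of the
package):

```python
import numpy as np
import tailblazer as tbzr

def naive_tail_mean(x, tail=0.95):
    'Reference version: re-filter and re-quantile for every value'
    means = np.empty(x.shape)
    for i, v in enumerate(x):
        under = x[x <= v]
        means[i] = under[under >= np.quantile(under, tail)].mean()
    return means

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])
print(tbzr.cume_tail_mean(x, tail=0.5))
print(naive_tail_mean(x, tail=0.5))
```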

tailblazer/__init__.py

+1-1
```diff
@@ -1,5 +1,5 @@
 
-from .functions import (
+from .compute import (
     pct_rank,
     cume_tail_mean
 )
```

tailblazer/functions.py → tailblazer/compute.py

+19-1
```diff
@@ -2,6 +2,8 @@
 import numpy as np
 from numpy.typing import NDArray
 
+#### Core Mechanics in 1D ####
+
 def pct_rank_1d(x: NDArray) -> NDArray:
     '''
     Internal implementation of cumulative
@@ -71,6 +73,8 @@ def cume_tail_mean_1d(x: NDArray, tail: float=0.95) -> NDArray:
 
     return mean_vec
 
+#### Internal Helper ####
+
 def apply_multidim(x: NDArray, axis: int, target_fun: callable) -> NDArray:
     '''
     An internal helper function designed to make
@@ -79,11 +83,23 @@ def apply_multidim(x: NDArray, axis: int, target_fun: callable) -> NDArray:
     handling.
     '''
 
+    try:
+        x = np.array(x, dtype=float)
+    except (TypeError, ValueError):
+        raise TypeError('`x` must be an array or coercible to an array')
+
+    if np.isnan(x).any():
+        # NOTE: Consider adding support for nan values later
+        raise ValueError('`x` cannot contain nan values')
+
+    if axis not in (0, 1):
+        raise ValueError('`axis` must be 0 or 1')
+
     n_dim = len(x.shape)
     transpose = n_dim == 2 and axis == 0
 
     if n_dim >= 3:
-        raise ValueError('Only 0, 1, and 2D arrays are supported')
+        raise ValueError('Only 1D and 2D arrays are supported')
 
     flip = lambda x: x.T if transpose else x
 
@@ -92,6 +108,8 @@ def apply_multidim(x: NDArray, axis: int, target_fun: callable) -> NDArray:
 
     return target_fun(x)
 
+#### Exported Algorithms ####
+
 def pct_rank(x: NDArray, axis: int=0) -> NDArray:
     '''
     Computes cumulative distribution over a NumPy
```

tests/test_cume_tail_mean.py

+72
@@ -0,0 +1,72 @@

```python
import tailblazer as tbzr
import numpy as np
import pytest

#### Test Data ####

# Simple arrays
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = np.array([1.0, 1.5, 2.5, 3.0, 4.0, 4.5, 5.5, 6.0])

# Shuffled arrays
shuffle_idx = [0, 7, 1, 2, 6, 3, 5, 4]
x_shuf = np.array([x[i] for i in shuffle_idx])
y_shuf = np.array([y[i] for i in shuffle_idx])

# 2D arrays
x_2d = np.array([[1, 2, 3, 4], [3, 4, 5, 6]])
y_2d_a0 = np.array([[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0]])
y_2d_a1 = np.array([[1.0, 1.5, 2.5, 3.0], [3.0, 3.5, 4.5, 5.0]])

# Special objects
x_nan = np.array([i if i != 1 else np.nan for i in x])
empty_arr = np.array([])
weird_data = {'weird': 'data'}
three_dim_arr = [[[1, 2]]]

#### Assertion Tests ####

def test_basic_tail_mean_usage():
    'The most simplistic case on a 1D array'

    assert all(tbzr.cume_tail_mean(x, tail=0.5) == y)

def test_unordered_input():
    'The same simple case but shuffled'

    assert all(tbzr.cume_tail_mean(x_shuf, tail=0.5) == y_shuf)

def test_empty_array():
    'Allows for empty arrays to simply pass through'

    assert tbzr.cume_tail_mean(empty_arr).shape == (0, )

def test_missing_values():
    'Ensures nan values are halted'

    with pytest.raises(ValueError):
        tbzr.cume_tail_mean(x_nan)

def test_weird_objects():
    'Ensures we must have a valid numeric array'

    with pytest.raises(TypeError):
        tbzr.cume_tail_mean(weird_data)

def test_2d_object():
    'Tests operating on different axes'

    assert np.all(tbzr.cume_tail_mean(x_2d, 0, 0.5) == y_2d_a0)
    assert np.all(tbzr.cume_tail_mean(x_2d, 1, 0.5) == y_2d_a1)

def test_3d_objects():
    'Tests halting on 3D+ arrays'

    with pytest.raises(ValueError):
        tbzr.cume_tail_mean(three_dim_arr)

def test_weird_axis():
    'Tests halting on invalid axis'

    with pytest.raises(ValueError):
        tbzr.cume_tail_mean(x, axis=0.5)
```

tests/test_pct_rank.py

+72
@@ -0,0 +1,72 @@

```python
import tailblazer as tbzr
import numpy as np
import pytest

#### Test Data ####

# Simple arrays
x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
y = np.array([0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0])

# Shuffled arrays
shuffle_idx = [0, 7, 1, 2, 6, 3, 5, 4]
x_shuf = np.array([x[i] for i in shuffle_idx])
y_shuf = np.array([y[i] for i in shuffle_idx])

# 2D arrays
x_2d = np.array([[1, 2], [3, 4]])
y_2d_a0 = np.array([[0.5, 0.5], [1.0, 1.0]])
y_2d_a1 = np.array([[0.5, 1.0], [0.5, 1.0]])

# Special objects
x_nan = np.array([i if i != 1 else np.nan for i in x])
empty_arr = np.array([])
weird_data = {'weird': 'data'}
three_dim_arr = [[[1, 2]]]

#### Assertion Tests ####

def test_basic_pct_rank_usage():
    'The most simplistic case on a 1D array'

    assert all(tbzr.pct_rank(x) == y)

def test_unordered_input():
    'The same simple case but shuffled'

    assert all(tbzr.pct_rank(x_shuf) == y_shuf)

def test_empty_array():
    'Allows for empty arrays to simply pass through'

    assert tbzr.pct_rank(empty_arr).shape == (0, )

def test_missing_values():
    'Ensures nan values are halted'

    with pytest.raises(ValueError):
        tbzr.pct_rank(x_nan)

def test_weird_objects():
    'Ensures we must have a valid numeric array'

    with pytest.raises(TypeError):
        tbzr.pct_rank(weird_data)

def test_2d_object():
    'Tests operating on different axes'

    assert np.all(tbzr.pct_rank(x_2d, 0) == y_2d_a0)
    assert np.all(tbzr.pct_rank(x_2d, 1) == y_2d_a1)

def test_3d_objects():
    'Tests halting on 3D+ arrays'

    with pytest.raises(ValueError):
        tbzr.pct_rank(three_dim_arr)

def test_weird_axis():
    'Tests halting on invalid axis'

    with pytest.raises(ValueError):
        tbzr.pct_rank(x, axis=0.5)
```
