Merge pull request #88 from explosion/feature/cache
Make Vectors.most_similar super fast by loading from cache
honnibal authored Nov 21, 2019
2 parents 7e8d72a + 9e9a8d3 commit 0c0965f
Showing 7 changed files with 211 additions and 87 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -27,6 +27,8 @@ models.
- spaCy **pipeline component** and **extension attributes**.
- Fully **serializable** so you can easily ship your sense2vec vectors with your
spaCy model packages.
- Optional **caching of nearest neighbors** for super fast "most similar"
queries.
- **Train your own vectors** using a pretrained spaCy model, raw text and
[GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via
[fastText](https://github.com/facebookresearch/fastText)
@@ -417,7 +419,9 @@ assert s2v.similarity("machine_learning|NOUN", "machine_learning|NOUN") == 1.0
#### <kbd>method</kbd> `Sense2Vec.most_similar`

Get the most similar entries in the table. If more than one key is provided, the
-average of the vectors is used.
+average of the vectors is used. To make this method faster, see the
+[script for precomputing a cache](scripts/06_precompute_cache.py) of the nearest
+neighbors.

| Argument | Type | Description |
| ------------ | ------------------------- | ------------------------------------------------------- |
@@ -699,6 +703,7 @@ clone and `make`.
| **3.** | [`03_glove_build_counts.py`](scripts/03_glove_build_counts.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) to build the vocabulary and counts. Skip this step if you're using Word2Vec via [FastText](https://github.com/facebookresearch/fastText). |
| **4.** | [`04_glove_train_vectors.py`](scripts/04_glove_train_vectors.py)<br />[`04_fasttext_train_vectors.py`](scripts/04_fasttext_train_vectors.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) or [FastText](https://github.com/facebookresearch/fastText) to train vectors. |
| **5.** | [`05_export.py`](scripts/05_export.py) | Load the vectors and frequencies and output a sense2vec component that can be loaded via `Sense2Vec.from_disk`. |
| **6.** | [`06_precompute_cache.py`](scripts/06_precompute_cache.py) | **Optional:** Precompute nearest-neighbor queries for every entry in the vocab to make `Sense2Vec.most_similar` faster. |

For more detailed documentation of the scripts, check out the source or run them
with `--help`. For example, `python scripts/01_parse.py --help`.
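For the new cache step, an invocation might look like `python scripts/06_precompute_cache.py /path/to/s2v_component -n 100 -c 100000` (the path and values here are only illustrative; see the script's `--help` for the full set of options).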
157 changes: 157 additions & 0 deletions scripts/06_precompute_cache.py
@@ -0,0 +1,157 @@
import plac
import tqdm
import numpy
import srsly
from wasabi import msg
from pathlib import Path


@plac.annotations(
vectors=("Path to sense2vec component directory", "positional", None, str),
gpu_id=("GPU device (-1 for CPU)", "option", "g", int),
n_neighbors=("Number of neighbors to cache", "option", "n", int),
batch_size=("Batch size for to reduce memory usage.", "option", "b", int),
cutoff=("Limit neighbors to this many earliest rows", "option", "c", int,),
start=("Index of vectors to start at.", "option", "s", int),
end=("Index of vectors to stop at.", "option", "e", int),
)
def main(
vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None
):
"""
Step 6: Precompute nearest-neighbor queries (optional)
Precompute nearest-neighbor queries for every entry in the vocab to make
    Sense2Vec.most_similar faster. The --cutoff option restricts the neighbor
    candidates to the first N (most frequent) rows of the vector table. For
    instance, if cutoff is 100000, no word will have a nearest neighbor outside
    of the top 100k vectors.
"""
if gpu_id == -1:
xp = numpy
else:
import cupy as xp
import cupy.cuda.device

cupy.take_along_axis = take_along_axis
device = cupy.cuda.device.Device(gpu_id)
device.use()
vectors_dir = Path(vectors)
vectors_file = vectors_dir / "vectors"
if not vectors_dir.is_dir() or not vectors_file.exists():
err = "Are you passing in the exported sense2vec directory containing a vectors file?"
msg.fail(f"Can't load vectors from {vectors}", err, exits=1)
with msg.loading(f"Loading vectors from {vectors}"):
vectors = xp.load(str(vectors_file))
msg.good(f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}")
norms = xp.linalg.norm(vectors, axis=1, keepdims=True)
norms[norms == 0] = 1
# Normalize to unit norm
vectors /= norms
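    # With every row scaled to unit length, the dot products computed below are cosine similarities.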
if cutoff < 1:
cutoff = vectors.shape[0]
if end is None:
end = vectors.shape[0]
mean = float(norms.mean())
var = float(norms.var())
msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})")
msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent")
n = min(n_neighbors, vectors.shape[0])
best_rows = xp.zeros((end - start, n), dtype="i")
scores = xp.zeros((end - start, n), dtype="f")
# Pre-allocate this array, so we can use it each time.
subset = xp.ascontiguousarray(vectors[:cutoff])
sims = xp.zeros((batch_size, cutoff), dtype="f")
indices = xp.arange(cutoff).reshape((-1, 1))
for i in tqdm.tqdm(list(range(start, end, batch_size))):
batch = vectors[i : i + batch_size]
# batch e.g. (1024, 300)
# vectors e.g. (10000, 300)
# sims e.g. (1024, 10000)
if batch.shape[0] == sims.shape[0]:
xp.dot(batch, subset.T, out=sims)
else:
# In the last batch we'll have a different size.
sims = xp.dot(batch, subset.T)
size = sims.shape[0]
# Get the indices and scores for the top N most similar for each in the
# batch. This is a bit complicated, to avoid sorting all of the scores
# -- we only want the top N to be sorted (which we do later). For now,
# we use argpartition to just get the cut point.
neighbors = xp.argpartition(sims, -n, axis=1)[:, -n:]
neighbor_sims = xp.partition(sims, -n, axis=1)[:, -n:]
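        # For example, with n=2 and a sims row of [0.1, 0.9, 0.3, 0.7],
        # argpartition(..., -2)[:, -2:] picks columns 1 and 3 (in no particular
        # order); the proper descending sort happens after the loop.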
# Can't figure out how to do this without the loop.
for j in range(min(end - i, size)):
best_rows[i + j] = neighbors[j]
scores[i + j] = neighbor_sims[j]
# Sort in reverse order
indices = xp.argsort(scores, axis=1)[:, ::-1]
scores = xp.take_along_axis(scores, indices, axis=1)
best_rows = xp.take_along_axis(best_rows, indices, axis=1)
msg.info("Saving output")
if not isinstance(best_rows, numpy.ndarray):
best_rows = best_rows.get()
if not isinstance(scores, numpy.ndarray):
scores = scores.get()
output = {
"indices": best_rows,
"scores": scores.astype("float16"),
"start": start,
"end": end,
"cutoff": cutoff,
}
output_file = vectors_dir / "cache"
with msg.loading("Saving output..."):
srsly.write_msgpack(output_file, output)
msg.good(f"Saved cache to {output_file}")


# This function is missing from cupy, but will be supported in cupy 7.
def take_along_axis(a, indices, axis):
"""Take values from the input array by matching 1d index and data slices.
Args:
a (cupy.ndarray): Array to extract elements.
indices (cupy.ndarray): Indices to take along each 1d slice of ``a``.
axis (int): The axis to take 1d slices along.
Returns:
cupy.ndarray: The indexed result.
.. seealso:: :func:`numpy.take_along_axis`
"""
import cupy

if indices.dtype.kind not in ("i", "u"):
raise IndexError("`indices` must be an integer array")

if axis is None:
a = a.ravel()
axis = 0

ndim = a.ndim

if not (-ndim <= axis < ndim):
raise IndexError("Axis overrun")

axis %= a.ndim

if ndim != indices.ndim:
raise ValueError("`indices` and `a` must have the same number of dimensions")

fancy_index = []
for i, n in enumerate(a.shape):
if i == axis:
fancy_index.append(indices)
else:
ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1)
fancy_index.append(cupy.arange(n).reshape(ind_shape))

return a[fancy_index]


if __name__ == "__main__":
try:
plac.call(main)
except KeyboardInterrupt:
msg.warn("Cancelled.")
80 changes: 0 additions & 80 deletions scripts/06_precompute_neighbors.py

This file was deleted.

27 changes: 25 additions & 2 deletions sense2vec/sense2vec.py
@@ -1,4 +1,4 @@
-from typing import Tuple, List, Union, Sequence, Dict, Callable
+from typing import Tuple, List, Union, Sequence, Dict, Callable, Any
from pathlib import Path
from spacy.vectors import Vectors
from spacy.strings import StringStore
@@ -32,7 +32,12 @@ def __init__(
self.vectors = Vectors(shape=shape, name=vectors_name)
self.strings = StringStore() if strings is None else strings
self.freqs: Dict[int, int] = {}
self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"}
self.cache = None
self.cfg: Dict[str, Any] = {
"senses": senses,
"make_key": "default",
"split_key": "default",
}
self.cfg.update(overrides)

@property
@@ -202,6 +207,15 @@ def most_similar(
raise ValueError(f"Can't find key {key} in table")
if len(self.vectors) < n_similar:
n_similar = len(self.vectors)
if self.cache:
indices = self.cache.get("indices", [])
scores = self.cache.get("scores", [])
if len(indices) >= n_similar:
key_row = self.vectors.find(key=key)
sim_keys = self.vectors.find(rows=indices[key_row][:n_similar])
sim_scores = scores[key_row][:n_similar]
result = [(self.strings[k], s) for k, s in zip(sim_keys, sim_scores)]
return [(key, score) for key, score in result if key not in keys]
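        # Without a (sufficient) cache, fall back to computing similarities on the fly.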
vecs = numpy.vstack([self[key] for key in keys])
average = vecs.mean(axis=0, keepdims=True)
result_keys, _, scores = self.vectors.most_similar(
@@ -269,6 +283,8 @@ def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes:
data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs}
if "strings" not in exclude:
data["strings"] = self.strings.to_bytes()
if "cache" not in exclude:
data["cache"] = self.cache
return srsly.msgpack_dumps(data)

def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
@@ -284,6 +300,8 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
self.cfg.update(data.get("cfg", {}))
if "strings" not in exclude and "strings" in data:
self.strings = StringStore().from_bytes(data["strings"])
if "cache" not in exclude and "cache" in data:
self.cache = data.get("cache", {})
return self

def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
@@ -298,6 +316,8 @@ def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
srsly.write_json(path / "freqs.json", list(self.freqs.items()))
if "strings" not in exclude:
self.strings.to_disk(path / "strings.json")
if "cache" not in exclude and self.cache:
srsly.write_msgpack(path / "cache", self.cache)

def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
"""Load a Sense2Vec object from a directory.
@@ -309,10 +329,13 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
path = Path(path)
strings_path = path / "strings.json"
freqs_path = path / "freqs.json"
cache_path = path / "cache"
self.vectors = Vectors().from_disk(path)
self.cfg.update(srsly.read_json(path / "cfg"))
if freqs_path.exists():
self.freqs = dict(srsly.read_json(freqs_path))
if "strings" not in exclude and strings_path.exists():
self.strings = StringStore().from_disk(strings_path)
if "cache" not in exclude and cache_path.exists():
self.cache = srsly.read_msgpack(cache_path)
return self
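
A minimal usage sketch of the new behaviour (the paths and query key are placeholders): `from_disk` picks up a `cache` file stored next to the vectors, `most_similar` then serves neighbors from it, and the cache can be omitted on save via `exclude`.

```python
from sense2vec import Sense2Vec

s2v = Sense2Vec().from_disk("/path/to/s2v_component")  # also loads "cache" if present
# Served from the precomputed cache when available, otherwise computed on the fly.
print(s2v.most_similar(["natural_language_processing|NOUN"], n=3))
s2v.to_disk("/tmp/s2v_no_cache", exclude=["cache"])  # skip the cache when saving
```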
2 changes: 1 addition & 1 deletion setup.cfg
@@ -1,5 +1,5 @@
[metadata]
-version = 1.0.0a8
+version = 1.0.0a9
description = Use NLP to go beyond vanilla word2vec
url = https://github.com/explosion/sense2vec
author = Explosion
Binary file added tests/data/cache
25 changes: 22 additions & 3 deletions tests/test_model.py
@@ -1,6 +1,7 @@
import pytest
from pathlib import Path
from sense2vec import Sense2Vec
import numpy


@pytest.fixture
@@ -10,10 +11,28 @@ def s2v():


def test_model_most_similar(s2v):
s2v.cache = None
assert "beekeepers|NOUN" in s2v
-    result = s2v.most_similar(["beekeepers|NOUN"], n=2)
-    assert result[0][0] == "honey_bees|NOUN"
-    assert result[1][0] == "Beekeepers|NOUN"
+    ((key1, _), (key2, _)) = s2v.most_similar(["beekeepers|NOUN"], n=2)
+    assert key1 == "honey_bees|NOUN"
+    assert key2 == "Beekeepers|NOUN"


def test_model_most_similar_cache(s2v):
query = "beekeepers|NOUN"
assert s2v.cache
assert query in s2v
# Modify cache to test that the cache is used and values aren't computed
query_row = s2v.vectors.find(key=s2v.ensure_int_key(query))
scores = numpy.array(s2v.cache["scores"], copy=True) # otherwise not writable
scores[query_row, 1] = 2.0
scores[query_row, 2] = 3.0
s2v.cache["scores"] = scores
((key1, score1), (key2, score2)) = s2v.most_similar([query], n=2)
assert key1 == "honey_bees|NOUN"
assert score1 == 2.0
assert key2 == "Beekeepers|NOUN"
assert score2 == 3.0


def test_model_other_senses(s2v):