Source code for simpleneighbors

import pickle
from simpleneighbors.backends import select_best

__author__ = 'Allison Parrish'
__email__ = 'allison@decontextualize.com'
__version__ = '0.1.0'


[docs]class SimpleNeighbors: """A Simple Neighbors index. This class wraps backend implementations of approximate nearest neighbors indexes with a user-friendly API. When you instantiate this class, it will automatically select a backend implementation based on packages installed in your environment. It is HIGHLY RECOMMENDED that you install Annoy (``pip install annoy``) to enable the Annoy backend! (The alternatives are slower and not as accurate.) Alternatively, you can specify a backend of your choosing with the ``backend`` parameter. Specify the number of dimensions in your data (i.e., the length of the list or array you plan to provide for each item) and the distance metric you want to use. The default is ``angular`` distance, an approximation of cosine distance. This metric is supported by all backends, as is ``euclidean`` (for Euclidean distance). Both of these parameters are passed directly to the backend; see the backend documentation for more details. :param dims: the number of dimensions in your data :param metric: the distance metric to use :param backend: the nearest neighbors backend to use (default is annoy) """ def __init__(self, dims, metric="angular", backend=None): if backend is None: backend = select_best() self.dims = dims self.metric = metric self.id_map = {} self.corpus = [] self.backend = backend(dims, metric=metric) self.i = 0 self.built = False
[docs] def add_one(self, item, vector): """Adds an item to the index. You need to provide the item to add and a vector that corresponds to that item. (For example, if the item is the name of a color, the vector might be a (R, G, B) triplet corresponding to that color. If the item is a word, the vector might be a word2vec or GloVe vector corresponding to that word. Items can be any `hashable <https://docs.python.org/3.7/glossary.html#term-hashable>`_ Python object. Vectors must be sequences of numbers. (Lists, tuples, and Numpy arrays should all be fine, for example.) Note: If the index has already been built, you won't be able to add new items. :param item: the item to add :param vector: the vector corresponding to that item :returns: None """ assert self.built is False, "Index already built; can't add new items." self.backend.add_item(self.i, vector) self.id_map[item] = self.i self.corpus.append(item) self.i += 1
[docs] def feed(self, items): """Add multiple items to the index. Supply to this method a sequence of (item, vector) tuples (e.g., a list of tuples, a generator that produces tuples, etc.) and they'll all be added to the index. Great for adding multiple items to the index at once. Items can be any `hashable <https://docs.python.org/3.7/glossary.html#term-hashable>`_ Python object. Vectors must be sequences of numbers. (Lists, tuples, and Numpy arrays should all be fine, for example.) .. doctest: >>> from simpleneighbors import SimpleNeighbors >>> sim = SimpleNeighbors(2, 'euclidean') >>> sim.feed([('a', (4, 5)), ... ('b', (0, 3)), ... ('c', (-2, 8)), ... ('d', (2, -2))]) >>> len(sim) 4 :param items: a sequence of (item, vector) tuples :returns: None """ for item, vector in items: self.add_one(item, vector)
[docs] def build(self, n=10, params=None): """Build the index. After adding all of your items, call this method to build the index. The meaning of parameter ``n`` is different for each backend implementation. For the Annoy backend, it specifies the number of trees in the underlying Annoy index (a higher number will take longer to build but provide more precision when querying). For the Sklearn backend, the number specifies the leaf size when building the ball tree. (The Brute Force Pure Python backend ignores this value entirely.) After you call build, you'll no longer be able to add new items to the index. :param n: backend-dependent (for Annoy: number of trees) :param params: dictionary with extra parameters to pass to backend """ self.backend.build(n, params) self.built = True
[docs] def nearest(self, vec, n=12): """Returns the items nearest to a given vector. The specified vector must have the same number of dimensions as the number given when initializing the index. The nearest neighbor search is limited to the given number of items, and results are sorted in order of proximity. .. doctest:: >>> from simpleneighbors import SimpleNeighbors >>> sim = SimpleNeighbors(2, 'euclidean') >>> sim.feed([('a', (4, 5)), ... ('b', (0, 3)), ... ('c', (-2, 8)), ... ('d', (2, -2))]) >>> sim.build() >>> sim.nearest((1, -1), n=1) ['d'] :param vec: search vector :param n: number of results to return :returns: a list of items sorted in order of proximity """ return [self.corpus[idx] for idx in self.backend.get_nns_by_vector(vec, n)]
[docs] def neighbors(self, item, n=12): """Returns the items nearest another item in the index. This method returns the items closest to a given item in the index in order of proximity, limiting results to the number specified. (It's just like :func:`~simpleneighbors.SimpleNeighbors.nearest` except using the vector of an item already in the corpus.) .. doctest:: >>> from simpleneighbors import SimpleNeighbors >>> sim = SimpleNeighbors(2, 'euclidean') >>> sim.feed([('a', (4, 5)), ... ('b', (0, 3)), ... ('c', (-2, 8)), ... ('d', (2, -2))]) >>> sim.build() >>> sim.neighbors('b', n=3) ['b', 'a', 'c'] :param item: a data item in that has already been added to the index :param n: the number of items to return :returns: a list of items sorted in order of proximity """ return self.nearest(self.vec(item), n)
[docs] def nearest_matching(self, vec, n=12, check=lambda x: True): """nearest_matching(vec, n=12, check=lambda x: True) Returns the items nearest a given vector that pass a test. This method looks for the items in the index nearest the given vector that meet a particular criterion. It tries to find at least ``n`` items, expanding the search as needed. (It may yield fewer than the desired number if enough items can't be found in the entire index.) The function passed as ``check`` will be called with a single parameter: the item in question. It should return ``True`` if the item should be included in the results, and ``False`` otherwise. This search process might be slow; in order to make it easier to display incremental results, this method returns a generator. You can easily get the results of this method as a list by enclosing your call inside the ``list()`` function. .. doctest:: >>> from simpleneighbors import SimpleNeighbors >>> sim = SimpleNeighbors(2, 'euclidean') >>> sim.feed([('a', (4, 5)), ... ('b', (0, 3)), ... ('c', (-2, 8)), ... ('d', (2, -2))]) >>> sim.build() >>> list(sim.nearest_matching((3.5, 4.5), n=1, ... check=lambda x: ord(x) <= ord('b'))) ['a'] :param vec: search vector :param n: number of items to return :param check: function to call on each item :returns: a generator yielding up to ``n`` items """ found = set() current_n = n while True: for item in self.nearest(vec, current_n): if item not in found and check(item): yield item found.add(item) if len(found) == n: break if len(found) == n or current_n >= len(self.corpus): break else: current_n *= 2
[docs] def neighbors_matching(self, item, n=12, check=None): """Returns the items nearest an indexed item that pass a test. This method is just like :func:`~simpleneighbors.SimpleNeighbors.nearest_matching`, but finds items nearest a given item already in the index, instead of an arbitrary vector. :param item: search item :param n: number of items to return :param check: function to call on each item :returns: a generator yielding up to ``n`` items """ for item in self.nearest_matching(self.vec(item), n, check): yield item
[docs] def dist(self, a, b): """Returns the distance between two items. :param a: first item :param b: second item :returns: distance between ``a`` and ``b`` """ return self.backend.get_distance(self.id_map[a], self.id_map[b])
[docs] def vec(self, item): """Returns the vector for an item. This method returns the vector that was originally provided when indexing the specified item. (Depending on how it was originally specified, they may have been converted to a different data type; e.g., integer vectors are converted to floats.) :param item: item to lookup :returns: vector for item """ return self.backend.get_item_vector(self.id_map[item])
def __len__(self): """Returns the number of items in the vector""" return len(self.corpus)
[docs] def save(self, prefix): """Saves the index to disk. This method saves the index to disk. Each backend manages serialization a little bit differently: consult the documentation and source code for more details. For example, because Annoy indexes can't be serialized with `pickle`, the Annoy backend's implementation produces two files: the serialized Annoy index, and a pickle with the other data from the object. This method's parameter specifies the "prefix" to use for these files. :param prefix: filename prefix for Annoy index and object data :returns: None """ import pickle with open(prefix + "-data.pkl", "wb") as fh: pickle.dump({ 'id_map': self.id_map, 'corpus': self.corpus, 'i': self.i, 'built': self.built, 'metric': self.metric, 'dims': self.dims, '_backend_class': self.backend.__class__ }, fh) self.backend.save(prefix + ".idx")
[docs] @classmethod def load(cls, prefix): """Restores a previously-saved index. This class method restores a previously-saved index using the specified file prefix. :param prefix: prefix used when saving :returns: SimpleNeighbors object restored from specified files """ with open(prefix + "-data.pkl", "rb") as fh: data = pickle.load(fh) newobj = cls( dims=data['dims'], metric=data['metric'], backend=data['_backend_class'] ) newobj.id_map = data['id_map'] newobj.corpus = data['corpus'] newobj.i = data['i'] newobj.built = data['built'] newobj.backend.load(prefix + ".idx") return newobj