Source code for simpleneighbors

import pickle
from simpleneighbors.backends import select_best

__author__ = 'Allison Parrish'
__email__ = 'allison@decontextualize.com'
__version__ = '0.1.0'


[docs]class SimpleNeighbors:
    """A Simple Neighbors index.

    This class wraps backend implementations of approximate nearest neighbors
    indexes with a user-friendly API. When you instantiate this class, it will
    automatically select a backend implementation based on packages installed
    in your environment. It is HIGHLY RECOMMENDED that you install Annoy (``pip
    install annoy``) to enable the Annoy backend! (The alternatives are
    slower and not as accurate.) Alternatively, you can specify a backend of
    your choosing with the ``backend`` parameter.

    Specify the number of dimensions in your data (i.e., the length of the list
    or array you plan to provide for each item) and the distance metric you
    want to use. The default is ``angular`` distance, an approximation of
    cosine distance. This metric is supported by all backends, as is
    ``euclidean`` (for Euclidean distance). Both of these parameters are passed
    directly to the backend; see the backend documentation for more details.

    :param dims: the number of dimensions in your data
    :param metric: the distance metric to use
    :param backend: the nearest neighbors backend to use (default is annoy)
    """

    def __init__(self, dims, metric="angular", backend=None):

        if backend is None:
            backend = select_best()

        self.dims = dims
        self.metric = metric
        self.id_map = {}
        self.corpus = []
        self.backend = backend(dims, metric=metric)
        self.i = 0
        self.built = False

[docs]    def add_one(self, item, vector):
        """Adds an item to the index.

        You need to provide the item to add and a vector that corresponds to
        that item. (For example, if the item is the name of a color, the vector
        might be a (R, G, B) triplet corresponding to that color. If the item
        is a word, the vector might be a word2vec or GloVe vector corresponding
        to that word.

        Items can be any `hashable
        <https://docs.python.org/3.7/glossary.html#term-hashable>`_ Python
        object. Vectors must be sequences of numbers. (Lists, tuples, and Numpy
        arrays should all be fine, for example.)

        Note: If the index has already been built, you won't be able to add new
        items.

        :param item: the item to add
        :param vector: the vector corresponding to that item
        :returns: None
        """

        assert self.built is False, "Index already built; can't add new items."
        self.backend.add_item(self.i, vector)
        self.id_map[item] = self.i
        self.corpus.append(item)
        self.i += 1

[docs]    def feed(self, items):
        """Add multiple items to the index.

        Supply to this method a sequence of (item, vector) tuples (e.g., a list
        of tuples, a generator that produces tuples, etc.) and they'll all be
        added to the index.  Great for adding multiple items to the index at
        once.

        Items can be any `hashable
        <https://docs.python.org/3.7/glossary.html#term-hashable>`_ Python
        object. Vectors must be sequences of numbers. (Lists, tuples, and Numpy
        arrays should all be fine, for example.)

        .. doctest:

            >>> from simpleneighbors import SimpleNeighbors
            >>> sim = SimpleNeighbors(2, 'euclidean')
            >>> sim.feed([('a', (4, 5)),
            ...     ('b', (0, 3)),
            ...     ('c', (-2, 8)),
            ...     ('d', (2, -2))])
            >>> len(sim)
            4

        :param items: a sequence of (item, vector) tuples
        :returns: None
        """
        for item, vector in items:
            self.add_one(item, vector)

[docs]    def build(self, n=10, params=None):
        """Build the index.

        After adding all of your items, call this method to build the index.
        The meaning of parameter ``n`` is different for each backend
        implementation. For the Annoy backend, it specifies the number of trees
        in the underlying Annoy index (a higher number will take longer to
        build but provide more precision when querying). For the Sklearn
        backend, the number specifies the leaf size when building the ball
        tree. (The Brute Force Pure Python backend ignores this value
        entirely.)

        After you call build, you'll no longer be able to add new items to the
        index.

        :param n: backend-dependent (for Annoy: number of trees)
        :param params: dictionary with extra parameters to pass to backend
        """
        self.backend.build(n, params)
        self.built = True

[docs]    def nearest(self, vec, n=12):
        """Returns the items nearest to a given vector.

        The specified vector must have the same number of dimensions as the
        number given when initializing the index. The nearest neighbor search
        is limited to the given number of items, and results are sorted in
        order of proximity.

        .. doctest::

            >>> from simpleneighbors import SimpleNeighbors
            >>> sim = SimpleNeighbors(2, 'euclidean')
            >>> sim.feed([('a', (4, 5)),
            ...     ('b', (0, 3)),
            ...     ('c', (-2, 8)),
            ...     ('d', (2, -2))])
            >>> sim.build()
            >>> sim.nearest((1, -1), n=1)
            ['d']

        :param vec: search vector
        :param n: number of results to return
        :returns: a list of items sorted in order of proximity
        """

        return [self.corpus[idx] for idx
                in self.backend.get_nns_by_vector(vec, n)]

[docs]    def neighbors(self, item, n=12):
        """Returns the items nearest another item in the index.

        This method returns the items closest to a given item in the index in
        order of proximity, limiting results to the number specified. (It's
        just like :func:`~simpleneighbors.SimpleNeighbors.nearest` except using
        the vector of an item already in the corpus.)

        .. doctest::

            >>> from simpleneighbors import SimpleNeighbors
            >>> sim = SimpleNeighbors(2, 'euclidean')
            >>> sim.feed([('a', (4, 5)),
            ...     ('b', (0, 3)),
            ...     ('c', (-2, 8)),
            ...     ('d', (2, -2))])
            >>> sim.build()
            >>> sim.neighbors('b', n=3)
            ['b', 'a', 'c']

        :param item: a data item in that has already been added to the index
        :param n: the number of items to return
        :returns: a list of items sorted in order of proximity
        """

        return self.nearest(self.vec(item), n)

[docs]    def nearest_matching(self, vec, n=12, check=lambda x: True):
        """nearest_matching(vec, n=12, check=lambda x: True)
        Returns the items nearest a given vector that pass a test.

        This method looks for the items in the index nearest the given vector
        that meet a particular criterion. It tries to find at least ``n``
        items, expanding the search as needed.  (It may yield fewer than the
        desired number if enough items can't be found in the entire index.)

        The function passed as ``check`` will be called with a single
        parameter: the item in question. It should return ``True`` if the item
        should be included in the results, and ``False`` otherwise.

        This search process might be slow; in order to make it easier to
        display incremental results, this method returns a generator. You can
        easily get the results of this method as a list by enclosing your call
        inside the ``list()`` function.

        .. doctest::

            >>> from simpleneighbors import SimpleNeighbors
            >>> sim = SimpleNeighbors(2, 'euclidean')
            >>> sim.feed([('a', (4, 5)),
            ...     ('b', (0, 3)),
            ...     ('c', (-2, 8)),
            ...     ('d', (2, -2))])
            >>> sim.build()
            >>> list(sim.nearest_matching((3.5, 4.5), n=1,
            ...     check=lambda x: ord(x) <= ord('b')))
            ['a']

        :param vec: search vector
        :param n: number of items to return
        :param check: function to call on each item
        :returns: a generator yielding up to ``n`` items
        """

        found = set()
        current_n = n
        while True:
            for item in self.nearest(vec, current_n):
                if item not in found and check(item):
                    yield item
                    found.add(item)
                if len(found) == n:
                    break
            if len(found) == n or current_n >= len(self.corpus):
                break
            else:
                current_n *= 2

[docs]    def neighbors_matching(self, item, n=12, check=None):
        """Returns the items nearest an indexed item that pass a test.

        This method is just like
        :func:`~simpleneighbors.SimpleNeighbors.nearest_matching`, but finds
        items nearest a given item already in the index, instead of an
        arbitrary vector.

        :param item: search item
        :param n: number of items to return
        :param check: function to call on each item
        :returns: a generator yielding up to ``n`` items
        """

        for item in self.nearest_matching(self.vec(item), n, check):
            yield item

[docs]    def dist(self, a, b):
        """Returns the distance between two items.

        :param a: first item
        :param b: second item
        :returns: distance between ``a`` and ``b``
        """
        return self.backend.get_distance(self.id_map[a], self.id_map[b])

[docs]    def vec(self, item):
        """Returns the vector for an item.

        This method returns the vector that was originally provided when
        indexing the specified item. (Depending on how it was originally
        specified, they may have been converted to a different data type; e.g.,
        integer vectors are converted to floats.)

        :param item: item to lookup
        :returns: vector for item
        """
        return self.backend.get_item_vector(self.id_map[item])

    def __len__(self):
        """Returns the number of items in the vector"""
        return len(self.corpus)

[docs]    def save(self, prefix):
        """Saves the index to disk.

        This method saves the index to disk. Each backend manages serialization
        a little bit differently: consult the documentation and source code for
        more details. For example, because Annoy indexes can't be serialized
        with `pickle`, the Annoy backend's implementation produces two files:
        the serialized Annoy index, and a pickle with the other data from the
        object.

        This method's parameter specifies the "prefix" to use for these files.

        :param prefix: filename prefix for Annoy index and object data
        :returns: None
        """

        import pickle
        with open(prefix + "-data.pkl", "wb") as fh:
            pickle.dump({
                'id_map': self.id_map,
                'corpus': self.corpus,
                'i': self.i,
                'built': self.built,
                'metric': self.metric,
                'dims': self.dims,
                '_backend_class': self.backend.__class__
            }, fh)
        self.backend.save(prefix + ".idx")

[docs]    @classmethod
    def load(cls, prefix):
        """Restores a previously-saved index.

        This class method restores a previously-saved index using the specified
        file prefix.

        :param prefix: prefix used when saving
        :returns: SimpleNeighbors object restored from specified files
        """

        with open(prefix + "-data.pkl", "rb") as fh:
            data = pickle.load(fh)
        newobj = cls(
            dims=data['dims'],
            metric=data['metric'],
            backend=data['_backend_class']
        )
        newobj.id_map = data['id_map']
        newobj.corpus = data['corpus']
        newobj.i = data['i']
        newobj.built = data['built']
        newobj.backend.load(prefix + ".idx")
        return newobj