From 7dcc1472fda88f3b00cfe33094032538a2004614 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 27 Feb 2023 13:23:45 -0800 Subject: [PATCH 01/35] adding linear search, faiss ann search, cached storage, and redis storage. Also refactoring indexer class for easing implementation of indexers that would depend on packages that include both search and storage. --- tensorflow_similarity/base_indexer.py | 435 ++++++++++++++++++ tensorflow_similarity/indexer.py | 344 +------------- tensorflow_similarity/search/__init__.py | 2 + tensorflow_similarity/search/faiss_search.py | 227 +++++++++ tensorflow_similarity/search/linear_search.py | 183 ++++++++ tensorflow_similarity/stores/__init__.py | 2 + tensorflow_similarity/stores/cached_store.py | 228 +++++++++ tensorflow_similarity/stores/redis_store.py | 191 ++++++++ 8 files changed, 1290 insertions(+), 322 deletions(-) create mode 100644 tensorflow_similarity/base_indexer.py create mode 100644 tensorflow_similarity/search/faiss_search.py create mode 100644 tensorflow_similarity/search/linear_search.py create mode 100644 tensorflow_similarity/stores/cached_store.py create mode 100644 tensorflow_similarity/stores/redis_store.py diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py new file mode 100644 index 00000000..ffafcdfe --- /dev/null +++ b/tensorflow_similarity/base_indexer.py @@ -0,0 +1,435 @@ +from abc import ABC, abstractmethod +import numpy as np +import tensorflow as tf +from .types import CalibrationResults, FloatTensor, Lookup, PandasDataFrame, Tensor +from collections.abc import Mapping, MutableMapping, Sequence +from .retrieval_metrics import RetrievalMetric +from .distances import Distance, distance_canonicalizer +from .evaluators import Evaluator, MemoryEvaluator +from .matchers import ClassificationMatch, make_classification_matcher +from .retrieval_metrics import RetrievalMetric +from .utils import unpack_lookup_distances, unpack_lookup_labels +from collections import 
defaultdict, deque + + +from .classification_metrics import ( + ClassificationMetric, + F1Score, + make_classification_metric, +) +from .matchers import ClassificationMatch, make_classification_matcher +from tabulate import tabulate + + + +class BaseIndexer(ABC): + def __init__(self, distance, embedding_output, embedding_size, evaluator, + stat_buffer_size): + distance = distance_canonicalizer(distance) + self.distance = distance # needed for save()/load() + self.embedding_output = embedding_output + self.embedding_size = embedding_size + + # internal structure naming + # FIXME support custom objects + self.evaluator_type = evaluator + + # stats configuration + self.stat_buffer_size = stat_buffer_size + + # calibration + self.is_calibrated = False + self.calibration_metric: ClassificationMetric = F1Score() + self.cutpoints: Mapping[str, Mapping[str, float | str]] = {} + self.calibration_thresholds: Mapping[str, np.ndarray] = {} + + return + + # evaluation related functions + def evaluate_retrieval( + self, + predictions: FloatTensor, + target_labels: Sequence[int], + retrieval_metrics: Sequence[RetrievalMetric], + verbose: int = 1, + ) -> dict[str, np.ndarray]: + """Evaluate the quality of the index against a test dataset. + + Args: + predictions: TF similarity model predictions, may be a multi-headed + output. + + target_labels: Sequence of the expected labels associated with the + embedded queries. + + retrieval_metrics: list of + [RetrievalMetric()](retrieval_metrics/overview.md) to compute. + + verbose (int, optional): Display results if set to 1 otherwise + results are returned silently. Defaults to 1. + + Returns: + Dictionary of metric results where keys are the metric names and + values are the metrics values. + """ + # Determine the maximum number of neighbors needed by the retrieval + # metrics because we do a single lookup. 
+ k = 1 + for m in retrieval_metrics: + if not isinstance(m, RetrievalMetric): + raise ValueError( + m, + "is not a valid RetrivalMetric(). The " + "RetrivialMetric() must be instantiated with " + "a valid K.", + ) + if m.k > k: + k = m.k + + # Add one more K to handle the case where we drop the closest lookup. + # This ensures that we always have enough lookups in the result set. + k += 1 + + # Find NN + lookups = self.batch_lookup(predictions, k=k, verbose=verbose) + + # Evaluate them + return self.evaluator.evaluate_retrieval( + retrieval_metrics=retrieval_metrics, + target_labels=target_labels, + lookups=lookups, + ) + + def evaluate_classification( + self, + predictions: FloatTensor, + target_labels: Sequence[int], + distance_thresholds: Sequence[float] | FloatTensor, + metrics: Sequence[str | ClassificationMetric] = ["f1"], + matcher: str | ClassificationMatch = "match_nearest", + k: int = 1, + verbose: int = 1, + ) -> dict[str, np.ndarray]: + """Evaluate the classification performance. + + Compute the classification metrics given a set of queries, lookups, and + distance thresholds. + + Args: + predictions: TF similarity model predictions, may be a multi-headed + output. + + target_labels: Sequence of expected labels for the lookups. + + distance_thresholds: A 1D tensor denoting the distances points at + which we compute the metrics. + + metrics: The set of classification metrics. + + matcher: {'match_nearest', 'match_majority_vote'} or + ClassificationMatch object. Defines the classification matching, + e.g., match_nearest will count a True Positive if the query_label + is equal to the label of the nearest neighbor and the distance is + less than or equal to the distance threshold. + + distance_rounding: How many digit to consider to + decide if the distance changed. Defaults to 8. + + verbose: Be verbose. Defaults to 1. + Returns: + A Mapping from metric name to the list of values computed for each + distance threshold. 
+ """ + combined_metrics: list[ClassificationMetric] = [make_classification_metric(m) for m in metrics] + + lookups = self.batch_lookup(predictions, k=k, verbose=verbose) + + # we also convert to np.ndarray first to avoid a slow down if + # convert_to_tensor is called on a list. + query_labels = tf.convert_to_tensor(np.array(target_labels)) + + # TODO(ovallis): The float type should be derived from the model. + lookup_distances = unpack_lookup_distances(lookups, dtype="float32") + lookup_labels = unpack_lookup_labels(lookups, dtype=query_labels.dtype) + thresholds: FloatTensor = tf.cast( + tf.convert_to_tensor(distance_thresholds), + dtype=lookup_distances.dtype, + ) + + results = self.evaluator.evaluate_classification( + query_labels=query_labels, + lookup_labels=lookup_labels, + lookup_distances=lookup_distances, + distance_thresholds=thresholds, + metrics=combined_metrics, + matcher=matcher, + verbose=verbose, + ) + + return results + + def calibrate( + self, + predictions: FloatTensor, + target_labels: Sequence[int], + thresholds_targets: MutableMapping[str, float], + calibration_metric: str | ClassificationMetric = "f1_score", # noqa + k: int = 1, + matcher: str | ClassificationMatch = "match_nearest", + extra_metrics: Sequence[str | ClassificationMetric] = [ + "precision", + "recall", + ], # noqa + rounding: int = 2, + verbose: int = 1, + ) -> CalibrationResults: + """Calibrate model thresholds using a test dataset. + + FIXME: more detailed explanation. + + Args: + predictions: TF similarity model predictions, may be a multi-headed + output. + + target_labels: Sequence of the expected labels associated with the + embedded queries. + + thresholds_targets: Dict of performance targets to (if possible) + meet with respect to the `calibration_metric`. + + calibration_metric: [ClassificationMetric()](metrics/overview.md) + used to evaluate the performance of the index. + + k: How many neighbors to use during the calibration. + Defaults to 1. 
+ + matcher: {'match_nearest', 'match_majority_vote'} or + ClassificationMatch object. Defines the classification matching, + e.g., match_nearest will count a True Positive if the query_label + is equal to the label of the nearest neighbor and the distance is + less than or equal to the distance threshold. + Defaults to 'match_nearest'. + + extra_metrics: list of additional + `tf.similarity.classification_metrics.ClassificationMetric()` to + compute and report. Defaults to ['precision', 'recall']. + + rounding: Metric rounding. Default to 2 digits. + + verbose: Be verbose and display calibration results. Defaults to 1. + + Returns: + CalibrationResults containing the thresholds and cutpoints Dicts. + """ + + # find NN + lookups = self.batch_lookup(predictions, k=k, verbose=verbose) + + # making sure our metrics are all ClassificationMetric objects + calibration_metric = make_classification_metric(calibration_metric) + + combined_metrics: list[ClassificationMetric] = [make_classification_metric(m) for m in extra_metrics] + + # running calibration + calibration_results = self.evaluator.calibrate( + target_labels=target_labels, + lookups=lookups, + thresholds_targets=thresholds_targets, + calibration_metric=calibration_metric, + matcher=matcher, + extra_metrics=combined_metrics, + metric_rounding=rounding, + verbose=verbose, + ) + + # display cutpoint results if requested + if verbose: + headers = ["name", "value", "distance"] # noqa + cutpoints = list(calibration_results.cutpoints.values()) + # dynamically find which metrics we need. We only need to look at + # the first cutpoints dictionary as all subsequent ones will have + # the same metric keys. 
+ for metric_name in cutpoints[0].keys(): + if metric_name not in headers: + headers.append(metric_name) + + rows = [] + for data in cutpoints: + rows.append([data[v] for v in headers]) + print("\n", tabulate(rows, headers=headers)) + + # store info for serialization purpose + self.is_calibrated = True + self.calibration_metric = calibration_metric + self.cutpoints = calibration_results.cutpoints + self.calibration_thresholds = calibration_results.thresholds + return calibration_results + + def match( + self, + predictions: FloatTensor, + no_match_label: int = -1, + k=1, + matcher: str | ClassificationMatch = "match_nearest", + verbose: int = 1, + ) -> dict[str, list[int]]: + """Match embeddings against the various cutpoints thresholds + + Args: + predictions: TF similarity model predictions, may be a multi-headed + output. + + no_match_label: What label value to assign when there is no match. + Defaults to -1. + + k: How many neighboors to use during the calibration. + Defaults to 1. + + matcher: {'match_nearest', 'match_majority_vote'} or + ClassificationMatch object. Defines the classification matching, + e.g., match_nearest will count a True Positive if the query_label + is equal to the label of the nearest neighbor and the distance is + less than or equal to the distance threshold. + + verbose: display progression. Default to 1. + + Notes: + + 1. It is up to the [`SimilarityModel.match()`](similarity_model.md) + code to decide which of cutpoints results to use / show to the + users. This function returns all of them as there is little + performance downside to do so and it makes the code clearer + and simpler. + + 2. The calling function is responsible to return the list of class + matched to allows implementation to use additional criteria if they + choose to. + + Returns: + Dict of cutpoint names mapped to lists of matches. 
+ """ + matcher = make_classification_matcher(matcher) + + lookups = self.batch_lookup(predictions, k=k, verbose=verbose) + + lookup_distances = unpack_lookup_distances(lookups, dtype=predictions.dtype) + # TODO(ovallis): The int type should be derived from the model. + lookup_labels = unpack_lookup_labels(lookups, dtype="int32") + + if verbose: + pb = tqdm( + total=len(lookup_distances) * len(self.cutpoints), + desc="matching embeddings", + ) + + matches: defaultdict[str, list[int]] = defaultdict(list) + for cp_name, cp_data in self.cutpoints.items(): + distance_threshold = float(cp_data["distance"]) + + pred_labels, pred_dist = matcher.derive_match( + lookup_labels=lookup_labels, lookup_distances=lookup_distances + ) + + for label, distance in zip(pred_labels, pred_dist): + if distance <= distance_threshold: + label = int(label) + else: + label = no_match_label + + matches[cp_name].append(label) + + if verbose: + pb.update() + + if verbose: + pb.close() + + return matches + + @abstractmethod + def add( + self, + prediction: FloatTensor, + label: int | None = None, + data: Tensor = None, + build: bool = True, + verbose: int = 1, + ): + """Add a single embedding to the indexer + + Args: + prediction: TF similarity model prediction, may be a multi-headed + output. + + label: Label(s) associated with the + embedding. Defaults to None. + + data: Input data associated with + the embedding. Defaults to None. + + build: Rebuild the index after insertion. + Defaults to True. Set it to false if you would like to add + multiples batches/points and build it manually once after. + + verbose: Display progress if set to 1. + Defaults to 1. + """ + + @abstractmethod + def batch_add( + self, + predictions: FloatTensor, + labels: Sequence[int] | None = None, + data: Tensor | None = None, + build: bool = True, + verbose: int = 1, + ): + """Add a batch of embeddings to the indexer + + Args: + predictions: TF similarity model predictions, may be a multi-headed + output. 
+ + labels: label(s) associated with the embedding. Defaults to None. + + datas: input data associated with the embedding. Defaults to None. + + build: Rebuild the index after insertion. + Defaults to True. Set it to false if you would like to add + multiples batches/points and build it manually once after. + + verbose: Display progress if set to 1. Defaults to 1. + """ + + @abstractmethod + def single_lookup(self, prediction: FloatTensor, k: int = 5) -> list[Lookup]: + """Find the k closest matches of a given embedding + + Args: + prediction: TF similarity model prediction, may be a multi-headed + output. + + k: Number of nearest neighbors to lookup. Defaults to 5. + Returns + list of the k nearest neighbors info: + list[Lookup] + """ + + + @abstractmethod + def batch_lookup(self, predictions: FloatTensor, k: int = 5, verbose: int = 1) -> list[list[Lookup]]: + + """Find the k closest matches for a set of embeddings + + Args: + predictions: TF similarity model predictions, may be a multi-headed + output. + + k: Number of nearest neighbors to lookup. Defaults to 5. + + verbose: Be verbose. Defaults to 1. 
+ + Returns + list of list of k nearest neighbors: + list[list[Lookup]] + """ diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 3fe72247..f2e0518c 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -21,6 +21,18 @@ from collections.abc import Mapping, MutableMapping, Sequence from pathlib import Path from time import time +from .base_indexer import BaseIndexer +from typing import ( + DefaultDict, + Deque, + Dict, + List, + Mapping, + MutableMapping, + Optional, + Sequence, + Union, +) import numpy as np import tensorflow as tf @@ -44,7 +56,7 @@ from .utils import unpack_lookup_distances, unpack_lookup_labels -class Indexer: +class Indexer(BaseIndexer): """Indexing system that allows to efficiently find nearest embeddings by indexing known embeddings and make them searchable using an [Approximate Nearest Neighbors Search] @@ -67,11 +79,11 @@ class Indexer: def __init__( self, embedding_size: int, - distance: Distance | str = "cosine", - search: Search | str = "nmslib", - kv_store: Store | str = "memory", - evaluator: Evaluator | str = "memory", - embedding_output: int | None = None, + distance: Union[Distance, str] = "cosine", + search: Union[Search, str] = "nmslib", + kv_store: Union[Store, str] = "memory", + evaluator: Union[Evaluator, str] = "memory", + embedding_output: int = None, stat_buffer_size: int = 1000, ) -> None: """Index embeddings to make them searchable via KNN @@ -104,26 +116,12 @@ def __init__( Raises: ValueError: Invalid search framework or key value store. 
""" - distance = distance_canonicalizer(distance) - self.distance = distance # needed for save()/load() - self.embedding_output = embedding_output - self.embedding_size = embedding_size - + super().__init__(distance, embedding_output, embedding_size, evaluator, + stat_buffer_size) # internal structure naming # FIXME support custom objects self.search_type = search self.kv_store_type = kv_store - self.evaluator_type = evaluator - - # stats configuration - self.stat_buffer_size = stat_buffer_size - - # calibration - self.is_calibrated = False - self.calibration_metric: ClassificationMetric = F1Score() - self.cutpoints: Mapping[str, Mapping[str, float | str]] = {} - self.calibration_thresholds: Mapping[str, np.ndarray] = {} - # initialize internal structures self._init_structures() @@ -136,6 +134,8 @@ def _init_structures(self) -> None: if self.search_type == "nmslib": self.search: Search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) + elif self.search_type == "linear": + self.search = LinearSearch(distance=self.distance, dim=embedding_size) elif isinstance(self.search_type, Search): self.search = self.search_type else: @@ -380,306 +380,6 @@ def batch_lookup(self, predictions: FloatTensor, k: int = 5, verbose: int = 1) - return batch_lookups - # evaluation related functions - def evaluate_retrieval( - self, - predictions: FloatTensor, - target_labels: Sequence[int], - retrieval_metrics: Sequence[RetrievalMetric], - verbose: int = 1, - ) -> dict[str, np.ndarray]: - """Evaluate the quality of the index against a test dataset. - - Args: - predictions: TF similarity model predictions, may be a multi-headed - output. - - target_labels: Sequence of the expected labels associated with the - embedded queries. - - retrieval_metrics: list of - [RetrievalMetric()](retrieval_metrics/overview.md) to compute. - - verbose (int, optional): Display results if set to 1 otherwise - results are returned silently. Defaults to 1. 
- - Returns: - Dictionary of metric results where keys are the metric names and - values are the metrics values. - """ - # Determine the maximum number of neighbors needed by the retrieval - # metrics because we do a single lookup. - k = 1 - for m in retrieval_metrics: - if not isinstance(m, RetrievalMetric): - raise ValueError( - m, - "is not a valid RetrivalMetric(). The " - "RetrivialMetric() must be instantiated with " - "a valid K.", - ) - if m.k > k: - k = m.k - - # Add one more K to handle the case where we drop the closest lookup. - # This ensures that we always have enough lookups in the result set. - k += 1 - - # Find NN - lookups = self.batch_lookup(predictions, k=k, verbose=verbose) - - # Evaluate them - return self.evaluator.evaluate_retrieval( - retrieval_metrics=retrieval_metrics, - target_labels=target_labels, - lookups=lookups, - ) - - def evaluate_classification( - self, - predictions: FloatTensor, - target_labels: Sequence[int], - distance_thresholds: Sequence[float] | FloatTensor, - metrics: Sequence[str | ClassificationMetric] = ["f1"], - matcher: str | ClassificationMatch = "match_nearest", - k: int = 1, - verbose: int = 1, - ) -> dict[str, np.ndarray]: - """Evaluate the classification performance. - - Compute the classification metrics given a set of queries, lookups, and - distance thresholds. - - Args: - predictions: TF similarity model predictions, may be a multi-headed - output. - - target_labels: Sequence of expected labels for the lookups. - - distance_thresholds: A 1D tensor denoting the distances points at - which we compute the metrics. - - metrics: The set of classification metrics. - - matcher: {'match_nearest', 'match_majority_vote'} or - ClassificationMatch object. Defines the classification matching, - e.g., match_nearest will count a True Positive if the query_label - is equal to the label of the nearest neighbor and the distance is - less than or equal to the distance threshold. 
- - distance_rounding: How many digit to consider to - decide if the distance changed. Defaults to 8. - - verbose: Be verbose. Defaults to 1. - Returns: - A Mapping from metric name to the list of values computed for each - distance threshold. - """ - combined_metrics: list[ClassificationMetric] = [make_classification_metric(m) for m in metrics] - - lookups = self.batch_lookup(predictions, k=k, verbose=verbose) - - # we also convert to np.ndarray first to avoid a slow down if - # convert_to_tensor is called on a list. - query_labels = tf.convert_to_tensor(np.array(target_labels)) - - lookup_distances = unpack_lookup_distances(lookups, dtype=tf.keras.backend.floatx()) - lookup_labels = unpack_lookup_labels(lookups, dtype=query_labels.dtype) - thresholds: FloatTensor = tf.cast( - tf.convert_to_tensor(distance_thresholds), - dtype=tf.keras.backend.floatx(), - ) - - results = self.evaluator.evaluate_classification( - query_labels=query_labels, - lookup_labels=lookup_labels, - lookup_distances=lookup_distances, - distance_thresholds=thresholds, - metrics=combined_metrics, - matcher=matcher, - verbose=verbose, - ) - - return results - - def calibrate( - self, - predictions: FloatTensor, - target_labels: Sequence[int], - thresholds_targets: MutableMapping[str, float], - calibration_metric: str | ClassificationMetric = "f1_score", # noqa - k: int = 1, - matcher: str | ClassificationMatch = "match_nearest", - extra_metrics: Sequence[str | ClassificationMetric] = [ - "precision", - "recall", - ], # noqa - rounding: int = 2, - verbose: int = 1, - ) -> CalibrationResults: - """Calibrate model thresholds using a test dataset. - - FIXME: more detailed explanation. - - Args: - predictions: TF similarity model predictions, may be a multi-headed - output. - - target_labels: Sequence of the expected labels associated with the - embedded queries. - - thresholds_targets: Dict of performance targets to (if possible) - meet with respect to the `calibration_metric`. 
- - calibration_metric: [ClassificationMetric()](metrics/overview.md) - used to evaluate the performance of the index. - - k: How many neighbors to use during the calibration. - Defaults to 1. - - matcher: {'match_nearest', 'match_majority_vote'} or - ClassificationMatch object. Defines the classification matching, - e.g., match_nearest will count a True Positive if the query_label - is equal to the label of the nearest neighbor and the distance is - less than or equal to the distance threshold. - Defaults to 'match_nearest'. - - extra_metrics: list of additional - `tf.similarity.classification_metrics.ClassificationMetric()` to - compute and report. Defaults to ['precision', 'recall']. - - rounding: Metric rounding. Default to 2 digits. - - verbose: Be verbose and display calibration results. Defaults to 1. - - Returns: - CalibrationResults containing the thresholds and cutpoints Dicts. - """ - - # find NN - lookups = self.batch_lookup(predictions, k=k, verbose=verbose) - - # making sure our metrics are all ClassificationMetric objects - calibration_metric = make_classification_metric(calibration_metric) - - combined_metrics: list[ClassificationMetric] = [make_classification_metric(m) for m in extra_metrics] - - # running calibration - calibration_results = self.evaluator.calibrate( - target_labels=target_labels, - lookups=lookups, - thresholds_targets=thresholds_targets, - calibration_metric=calibration_metric, - matcher=matcher, - extra_metrics=combined_metrics, - metric_rounding=rounding, - verbose=verbose, - ) - - # display cutpoint results if requested - if verbose: - headers = ["name", "value", "distance"] # noqa - cutpoints = list(calibration_results.cutpoints.values()) - # dynamically find which metrics we need. We only need to look at - # the first cutpoints dictionary as all subsequent ones will have - # the same metric keys. 
- for metric_name in cutpoints[0].keys(): - if metric_name not in headers: - headers.append(metric_name) - - rows = [] - for data in cutpoints: - rows.append([data[v] for v in headers]) - print("\n", tabulate(rows, headers=headers)) - - # store info for serialization purpose - self.is_calibrated = True - self.calibration_metric = calibration_metric - self.cutpoints = calibration_results.cutpoints - self.calibration_thresholds = calibration_results.thresholds - return calibration_results - - def match( - self, - predictions: FloatTensor, - no_match_label: int = -1, - k=1, - matcher: str | ClassificationMatch = "match_nearest", - verbose: int = 1, - ) -> dict[str, list[int]]: - """Match embeddings against the various cutpoints thresholds - - Args: - predictions: TF similarity model predictions, may be a multi-headed - output. - - no_match_label: What label value to assign when there is no match. - Defaults to -1. - - k: How many neighboors to use during the calibration. - Defaults to 1. - - matcher: {'match_nearest', 'match_majority_vote'} or - ClassificationMatch object. Defines the classification matching, - e.g., match_nearest will count a True Positive if the query_label - is equal to the label of the nearest neighbor and the distance is - less than or equal to the distance threshold. - - verbose: display progression. Default to 1. - - Notes: - - 1. It is up to the [`SimilarityModel.match()`](similarity_model.md) - code to decide which of cutpoints results to use / show to the - users. This function returns all of them as there is little - performance downside to do so and it makes the code clearer - and simpler. - - 2. The calling function is responsible to return the list of class - matched to allows implementation to use additional criteria if they - choose to. - - Returns: - Dict of cutpoint names mapped to lists of matches. 
- """ - matcher = make_classification_matcher(matcher) - - lookups = self.batch_lookup(predictions, k=k, verbose=verbose) - - lookup_distances = unpack_lookup_distances(lookups, dtype=tf.keras.backend.floatx()) - # TODO(ovallis): The int type should be derived from the model. - lookup_labels = unpack_lookup_labels(lookups, dtype="int32") - - if verbose: - pb = tqdm( - total=len(lookup_distances) * len(self.cutpoints), - desc="matching embeddings", - ) - - matches: defaultdict[str, list[int]] = defaultdict(list) - for cp_name, cp_data in self.cutpoints.items(): - distance_threshold = float(cp_data["distance"]) - - pred_labels, pred_dist = matcher.derive_match( - lookup_labels=lookup_labels, lookup_distances=lookup_distances - ) - - for label, distance in zip(pred_labels, pred_dist): - if distance <= distance_threshold: - label = int(label) - else: - label = no_match_label - - matches[cp_name].append(label) - - if verbose: - pb.update() - - if verbose: - pb.close() - - return matches - def save(self, path: str, compression: bool = True): """Save the index to disk diff --git a/tensorflow_similarity/search/__init__.py b/tensorflow_similarity/search/__init__.py index d1ac0b30..38466f2c 100644 --- a/tensorflow_similarity/search/__init__.py +++ b/tensorflow_similarity/search/__init__.py @@ -37,6 +37,8 @@ # Disable the INFO logging from NMSLIB logging.getLogger("nmslib").setLevel(logging.WARNING) +from .faiss_search import FaissSearch # noqa +from .linear_search import LinearSearch from .nmslib_search import NMSLibSearch # noqa from .search import Search # noqa from .utils import make_search # noqa diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py new file mode 100644 index 00000000..e4ac89b1 --- /dev/null +++ b/tensorflow_similarity/search/faiss_search.py @@ -0,0 +1,227 @@ +"""The module to handle FAISS search.""" + +from collections.abc import Mapping, Sequence +from termcolor import cprint +from .search import 
Search +import faiss +import numpy as np +from tensorflow_similarity.distances import Distance +from tensorflow_similarity.types import FloatTensor +from pathlib import Path +from typing import Any + + +class FaissSearch(Search): + """This class implements the Faiss ANN interface. + + It implements the Search interface. + """ + + def __init__( + self, + distance: Distance | str, + dim: int, + verbose: int = 0, + name: str | None = None, + algo="ivfpq", + m=8, + nbits=8, + nlist=1024, + nprobe=1, + normalize=True, + ): + """Initiate FAISS indexer + + Args: + d: number of dimensions + m: number of centroid IDs in final compressed vectors. d must be divisible + by m + nbits: number of bits in each centroid + nlist: how many Voronoi cells (must be greater than or equal to 2**nbits) + nprobe: how many of the nearest cells to include in search + """ + super().__init__(distance=distance, dim=dim, verbose=verbose, name=name) + self.algo = algo + self.m = m # number of bits per subquantizer + self.nbits = nbits + self.nlist = nlist + self.nprobe = nprobe + self.normalize = normalize + self.built = False + + if verbose: + t_msg = [ + "\n|-Initialize NMSLib Index", + f"| - algo: {self.algo}", + f"| - m: {self.m}", + f"| - nbits: {self.nbits}", + f"| - nlist: {self.nlist}", + f"| - nprobe: {self.nprobe}", + f"| - normalize: {self.normalize}", + f"| - query_params: {self.query_params}", + ] + cprint("\n".join(t_msg) + "\n", "green") + + if self.algo == "ivfpq": + assert dim % m == 0, f"dim={dim}, m={m}" + if self.algo == "ivfpq": + metric = faiss.METRIC_L2 + prefix = "" + if distance == "cosine": + prefix = "L2norm," + metric = faiss.METRIC_INNER_PRODUCT + # this distance requires both the input and query vectors to be normalized + ivf_string = f"IVF{nlist}," + pq_string = f"PQ{m}x{nbits}" + factory_string = prefix + ivf_string + pq_string + self.index = faiss.index_factory(dim, factory_string, metric) + # quantizer = faiss.IndexFlatIP( + # dim + # ) # we keep the same L2 
distance flat index + # self.index = faiss.IndexIVFPQ( + # quantizer, dim, nlist, m, nbits, metric=faiss.METRIC_INNER_PRODUCT + # ) + # else: + # quantizer = faiss.IndexFlatL2( + # dim + # ) # we keep the same L2 distance flat index + # self.index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, nbits) + self.index.nprobe = nprobe # set how many of nearest cells to search + elif algo == "flat": + if distance == "cosine": + # this is exact match using cosine/dot-product Distance + self.index = faiss.IndexFlatIP(dim) + else: + # this is exact match using L2 distance + self.index = faiss.IndexFlatL2(dim) + + def is_built(self): + return self.built + + def needs_building(self): + if self.algo == "flat": + return False + else: + return not self.index.is_trained + + def build_index(self, samples, **kwargss): + if self.algo == "ivfpq": + if self.normalize: + faiss.normalize_L2(samples) + self.index.train(samples) # we must train the index to cluster into cells + self.built = True + + def batch_lookup( + self, embeddings: FloatTensor, k: int = 5 + ) -> tuple[list[list[int]], list[list[float]]]: + """Find embeddings K nearest neighboors embeddings. + + Args: + embedding: Batch of query embeddings as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. + """ + + if self.normalize: + faiss.normalize_L2(embeddings) + D, I = self.index.search(embeddings, k) + return I, D + + def lookup( + self, embedding: FloatTensor, k: int = 5 + ) -> tuple[list[int], list[float]]: + """Find embedding K nearest neighboors embeddings. + + Args: + embedding: Query embedding as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. 
+ """ + int_embedding = np.array([embedding], dtype=np.float32) + if self.normalize: + faiss.normalize_L2(int_embedding) + D, I = self.index.search(int_embedding, k) + return I[0], D[0] + + def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): + """Add a single embedding to the search index. + + Args: + embedding: The embedding to index as computed by the similarity model. + idx: Embedding id as in the index table. Returned with the embedding to + allow to lookup the data associated with a given embedding. + """ + int_embedding = np.array([embedding], dtype=np.float32) + if self.normalize: + faiss.normalize_L2(int_embedding) + if self.algo != "flat": + self.index.add_with_ids(int_embedding) + else: + self.index.add(int_embedding) + + def batch_add( + self, + embeddings: FloatTensor, + idxs: Sequence[int], + verbose: int = 1, + normalize: bool = True, + **kwargs, + ): + """Add a batch of embeddings to the search index. + + Args: + embeddings: List of embeddings to add to the index. + idxs (int): Embedding ids as in the index table. Returned with the + embeddings to allow to lookup the data associated with the returned + embeddings. + verbose: Be verbose. Defaults to 1. 
+ """ + if self.normalize: + faiss.normalize_L2(embeddings) + if self.algo != "flat": + # flat does not accept indexes as parameters and assumes incremental + # indexes + self.index.add_with_ids(embeddings, idxs) + else: + self.index.add(embeddings) + + def save(self, path: str): + """Serializes the index data on disk + + Args: + path: where to store the data + """ + chunk = faiss.serialize_index(self.index) + np.save(self.__make_fname(path), chunk) + + def __make_fname(self, path): + return str(Path(path) / "faiss_index.npy") + + def load(self, path: str): + """load index on disk + + Args: + path: where to store the data + """ + self.index = faiss.deserialize_index( + np.load(self.__make_fname(path)) + ) # identical to index + + def get_config(self) -> dict[str, Any]: + """Contains the search configuration. + + Returns: + A Python dict containing the configuration of the search obj. + """ + config = { + "distance": self.distance.name, + "dim": self.dim, + "algo": self.algo, + "m": self.m, + "nlist": self.nlist, + "nprobe": self.nprobe, + "normalize": self.normalize, + "verbose": self.verbose, + "name": self.name, + "canonical_name": self.__class__.__name__, + } + + return config diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py new file mode 100644 index 00000000..65cf536f --- /dev/null +++ b/tensorflow_similarity/search/linear_search.py @@ -0,0 +1,183 @@ +"""The module to handle Linear search.""" + +from collections.abc import Sequence +from .search import Search +from tensorflow_similarity.distances import Distance +from tensorflow_similarity.types import FloatTensor +from pathlib import Path +from typing import Any +import numpy as np +import tensorflow as tf +import pickle +import json +from termcolor import cprint + +INITIAL_DB_SIZE = 10000 +DB_SIZE_STEPS = 10000 + + +class LinearSearch(Search): + """This class implements the Linear Search interface. + + It implements the Search interface. 
+ """ + + def __init__( + self, + distance: Distance | str, + dim: int, + verbose: int = 0, + name: str | None = None, + ): + """Initiate Linear indexer. + + Args: + d: number of dimensions + m: number of centroid IDs in final compressed vectors. d must be divisible + by m + nbits: number of bits in each centroid + nlist: how many Voronoi cells (must be greater than or equal to 2**nbits) + nprobe: how many of the nearest cells to include in search + """ + super().__init__(distance=distance, dim=dim, verbose=verbose, name=name) + + if verbose: + t_msg = [ + "\n|-Initialize NMSLib Index", + f"| - distance: {self.distance}", + f"| - dim: {self.dim}", + f"| - verbose: {self.verbose}", + f"| - name: {self.name}", + ] + cprint("\n".join(t_msg) + "\n", "green") + self.db = np.empty((INITIAL_DB_SIZE, dim), dtype=np.float32) + self.ids = [] + + + + def is_built(self): + return True + + def needs_building(self): + return False + + def batch_lookup( + self, embeddings: FloatTensor, k: int = 5 + ) -> tuple[list[list[int]], list[list[float]]]: + """Find embeddings K nearest neighboors embeddings. + + Args: + embedding: Batch of query embeddings as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. + """ + + normalized_query = tf.math.l2_normalize(embeddings, axis=1) + items = len(self.ids) + sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) + similarity, id_idxs = tf.math.top_k(sims, k) + ids_array = np.array(self.ids) + return np.array([ids_array[x.numpy()] for x in id_idxs]), similarity + + def lookup( + self, embedding: FloatTensor, k: int = 5 + ) -> tuple[list[int], list[float]]: + """Find embedding K nearest neighboors embeddings. + + Args: + embedding: Query embedding as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. 
+ """ + normalized_query = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) + items = len(self.ids) + sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) + similarity, id_idxs = tf.math.top_k(sims, k) + ids_array = np.array(self.ids) + return np.array(ids_array[id_idxs[0].numpy()]), similarity[0] + + def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): + """Add a single embedding to the search index. + + Args: + embedding: The embedding to index as computed by the similarity model. + idx: Embedding id as in the index table. Returned with the embedding to + allow to lookup the data associated with a given embedding. + """ + int_embedding = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) + items = len(self.ids) + if items + 1 > self.db.shape[0]: + # it's full + new_db = np.empty((len(self.ids) + DB_SIZE_STEPS, self.dim), dtype=np.float32) + new_db[:items] = self.db + self.db = new_db + self.ids.append(idx) + self.db[items] = int_embedding + + def batch_add( + self, + embeddings: FloatTensor, + idxs: Sequence[int], + verbose: int = 1, + normalize: bool = True, + **kwargs, + ): + """Add a batch of embeddings to the search index. + + Args: + embeddings: List of embeddings to add to the index. + idxs (int): Embedding ids as in the index table. Returned with the + embeddings to allow to lookup the data associated with the returned + embeddings. + verbose: Be verbose. Defaults to 1. 
+ """ + int_embeddings = tf.math.l2_normalize(embeddings, axis=1) + items = len(self.ids) + if items + len(embeddings) > self.db.shape[0]: + # it's full + new_db = np.empty((((items + len(embeddings) + DB_SIZE_STEPS) // DB_SIZE_STEPS) * DB_SIZE_STEPS, self.dim), dtype=np.float32) + new_db[:items] = self.db + self.db = new_db + self.ids.extend(idxs) + self.db[items:items+len(embeddings)] = int_embeddings + + def __make_file_path(self, path): + return path / "index.pickle" + + def save(self, path: str): + """Serializes the index data on disk + + Args: + path: where to store the data + """ + with open(self.__make_file_path(path), "wb") as f: + pickle.dump((self.db, self.ids), f) + + def load(self, path: str): + """load index on disk + + Args: + path: where to store the data + """ + with open(self.__make_file_path(path), "rb") as f: + data = pickle.load(f) + self.db = data[0] + self.ids = data[1] + + def __make_config_path(self, path): + return path / "config.json" + + def __save_config(self): + with open(self.__make_config_file_path(path), "wt") as f: + json.dump(self.get_config(), f) + + def get_config(self) -> dict[str, Any]: + """Contains the search configuration. + + Returns: + A Python dict containing the configuration of the search obj. 
+ """ + config = { + "distance": self.distance.name, + "dim": self.dim, + } + + return config diff --git a/tensorflow_similarity/stores/__init__.py b/tensorflow_similarity/stores/__init__.py index ea2f5772..a7c71e31 100644 --- a/tensorflow_similarity/stores/__init__.py +++ b/tensorflow_similarity/stores/__init__.py @@ -29,3 +29,5 @@ from .memory_store import MemoryStore # noqa from .store import Store # noqa +from .cached_store import CachedStore # noqa +from .redis_store import RedisStore # noqa diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py new file mode 100644 index 00000000..1cfdc55b --- /dev/null +++ b/tensorflow_similarity/stores/cached_store.py @@ -0,0 +1,228 @@ +# Copyright 2021 The TensorFlow Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import annotations + +import io +from collections.abc import Sequence +from pathlib import Path + +import numpy as np +import pandas as pd +import tensorflow as tf +import pickle +import shutil +import dbm +import json +import math + +from tensorflow_similarity.types import FloatTensor, PandasDataFrame, Tensor + +from .store import Store + + +class CachedStore(Store): + """Efficient cached dataset store""" + + def __init__(self, shard_size=1000000) -> None: + # We are using a native python cached dictionary + # db[id] = pickle((embedding, label, data)) + self.db: list[dict[str, str]] = [] + self.shard_size = shard_size + self.num_items: int = 0 + self.path: str = "." + + def __get_shard_file_path(self, shard_no): + return f'{self.path}/cache{shard_no}' + + def __make_new_shard(self, shard_no: int): + return dbm.open(self.__get_shard_file_path(shard_no), 'c') + + def __add_new_shard(self): + shard_no = len(self.db) + self.db.append(self.__make_new_shard(shard_no)) + + def __reopen_all_shards(self): + for shard_no in range(len(self.db)): + self.db[shard_no] = self.__make_new_shard(shard_no) + + def add( + self, + embedding: FloatTensor, + label: int | None = None, + data: Tensor | None = None, + ) -> int: + """Add an Embedding record to the key value store. + + Args: + embedding: Embedding predicted by the model. + + label: Class numerical id. Defaults to None. + + data: Data associated with the embedding. Defaults to None. + + Returns: + Associated record id. + """ + idx = self.num_items + shard_no = idx // self.shard_size + if len(self.db) <= shard_no: + self.__add_new_shard() + self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, data)) + self.num_items += 1 + return idx + + def batch_add( + self, + embeddings: Sequence[FloatTensor], + labels: Sequence[int] | None = None, + data: Sequence[Tensor] | None = None, + ) -> list[int]: + """Add a set of embedding records to the key value store. 
+ + Args: + embeddings: Embeddings predicted by the model. + + labels: Class numerical ids. Defaults to None. + + data: Data associated with the embeddings. Defaults to None. + + See: + add() for what a record contains. + + Returns: + List of associated record id. + """ + idxs: list[int] = [] + for i, embedding in enumerate(embeddings): + idx = i + self.num_items + label = None if labels is None else labels[i] + rec_data = None if data is None else data[i] + shard_no = idx // self.shard_size + if len(self.db) <= shard_no: + self.__add_new_shard() + self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, rec_data)) + idxs.append(idx) + + return idxs + + def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: + """Get an embedding record from the key value store. + + Args: + idx: Id of the record to fetch. + + Returns: + record associated with the requested id. + """ + + shard_no = idx // self.shard_size + embedding, label, data = pickle.loads(self.db[shard_no][str(idx)]) + return embedding, label, data + + def batch_get(self, idxs: Sequence[int]) -> tuple[list[FloatTensor], list[int | None], list[Tensor | None]]: + """Get embedding records from the key value store. + + Args: + idxs: ids of the records to fetch. + + Returns: + List of records associated with the requested ids. + """ + embeddings = [] + labels = [] + data = [] + for idx in idxs: + e, l, d = self.get(idx) + embeddings.append(e) + labels.append(l) + data.append(d) + return embeddings, labels, data + + def size(self) -> int: + "Number of record in the key value store." 
+ return self.num_items + + def __close_all_shards(self): + for shard in self.db: + shard.close() + + def __copy_shards(self, path): + for shard_no in range(len(self.db)): + shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix('.db'), path) + + def __make_config_file_path(self, path): + return path / "config.json" + + def __save_config(self, path): + with open(self.__make_config_file_path(path), "wt") as f: + json.dump(self.get_config(), f) + + def __set_config(self, num_items, shard_size): + self.num_items = num_items + self.shard_size = shard_size + + def __load_config(self, path): + with open(self.__make_config_file_path(path), "rt") as f: + self.__set_config(**json.load(f)) + + def save(self, path: str, compression: bool = True) -> None: + """Serializes index on disk. + + Args: + path: where to store the data. + compression: Compress index data. Defaults to True. + """ + # Writing to a buffer to avoid read error in np.savez when using GFile. + # See: https://github.com/tensorflow/tensorflow/issues/32090 + self.__close_all_shards() + self.__copy_shards(path) + self.__save_config(path) + self.__reopen_all_shards() + + def get_config(self): + return { + "shard_size": self.shard_size, + "num_items": self.num_items + } + + def load(self, path: str) -> int: + """load index on disk + + Args: + path: which directory to use to store the index data. + + Returns: + Number of records reloaded. + """ + self.__load_config(path) + num_shards = int(math.ceil(self.num_items / self.shard_size)) + self.path = path + for i in range(self.num_items): + self.__add_new_shard() + return self.size() + + def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: + """Export data as a Pandas dataframe. + + Cached store does not fit in memory, therefore we do not implement this. + + Args: + num_records: Number of records to export to the dataframe. + Defaults to 0 (unlimited). 
+ + Returns: + None + """ + + return None diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py new file mode 100644 index 00000000..2a51d55f --- /dev/null +++ b/tensorflow_similarity/stores/redis_store.py @@ -0,0 +1,191 @@ +# Copyright 2021 The TensorFlow Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from collections.abc import Sequence + +import redis + +from .store import Store + +from tensorflow_similarity.types import FloatTensor, PandasDataFrame, Tensor + + +class RedisStore(Store): + """Efficient Redis dataset store""" + + def __init__(self, host="localhost", port=6379, db=0) -> None: + # Currently does not support authentication + self.host = host + self.port = port + self.db = db + self.__connect() + + def add( + self, + embedding: FloatTensor, + label: int | None = None, + data: Tensor | None = None, + ) -> int: + """Add an Embedding record to the key value store. + + Args: + embedding: Embedding predicted by the model. + + label: Class numerical id. Defaults to None. + + data: Data associated with the embedding. Defaults to None. + + Returns: + Associated record id. 
+ """ + num_items = self.__conn.incr("num_items") + idx = num_items - 1 + self.__conn.set(idx, (embedding, label, data)) + + return idx + + def get_num_items(self): + return self.__conn.get("num_items") or 0 + + def batch_add( + self, + embeddings: Sequence[FloatTensor], + labels: Sequence[int] | None = None, + data: Sequence[Tensor] | None = None, + ) -> list[int]: + """Add a set of embedding records to the key value store. + + Args: + embeddings: Embeddings predicted by the model. + + labels: Class numerical ids. Defaults to None. + + data: Data associated with the embeddings. Defaults to None. + + See: + add() for what a record contains. + + Returns: + List of associated record id. + """ + idxs: list[int] = [] + for i, embedding in enumerate(embeddings): + label = None if labels is None else labels[i] + rec_data = None if data is None else data[i] + idx = self.add(embedding, label, rec_data) + idxs.append(idx) + + return idxs + + def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: + """Get an embedding record from the key value store. + + Args: + idx: Id of the record to fetch. + + Returns: + record associated with the requested id. + """ + + return self.__conn.get(str(idx)) + + def batch_get(self, idxs: Sequence[int]) -> tuple[list[FloatTensor], list[int | None], list[Tensor | None]]: + """Get embedding records from the key value store. + + Args: + idxs: ids of the records to fetch. + + Returns: + List of records associated with the requested ids. + """ + embeddings = [] + labels = [] + data = [] + for idx in idxs: + e, l, d = self.get(idx) + embeddings.append(e) + labels.append(l) + data.append(d) + return embeddings, labels, data + + def size(self) -> int: + "Number of record in the key value store." 
+ return self.get_num_items() + + def __make_config_file_path(self, path): + return path / "config.json" + + def __save_config(self, path): + with open(self.__make_config_file_path(path), "wt") as f: + json.dump(self.get_config(), f) + + def __set_config(self, host, port, db): + self.host = host + self.port = port + self.db = db + + def __connect(self): + self.__conn = redis.Redis(host=self.host, port=self.port, db=self.db) + + def __load_config(self, path): + with open(self.__make_config_file_path(path), "rt") as f: + self.__set_config(**json.load(f)) + self.__connect() + + def save(self, path: str, compression: bool = True) -> None: + """Serializes index on disk. + + Args: + path: where to store the data. + compression: Compress index data. Defaults to True. + """ + # Writing to a buffer to avoid read error in np.savez when using GFile. + # See: https://github.com/tensorflow/tensorflow/issues/32090 + self.__save_config(path) + + def get_config(self): + return { + "host": self.host, + "port": self.port, + "db": self.db, + "num_items": self.get_num_items() + } + + def load(self, path: str) -> int: + """load index on disk + + Args: + path: which directory to use to store the index data. + + Returns: + Number of records reloaded. + """ + self.__load_config(path) + return self.size() + + def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: + """Export data as a Pandas dataframe. + + Cached store does not fit in memory, therefore we do not implement this. + + Args: + num_records: Number of records to export to the dataframe. + Defaults to 0 (unlimited). 
+ + Returns: + None + """ + + return None From 26dbf150750f188e7ce9aad8ed01a98f47e90f1d Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 27 Feb 2023 14:05:11 -0800 Subject: [PATCH 02/35] formatting --- tensorflow_similarity/base_indexer.py | 63 ++- tensorflow_similarity/indexer.py | 3 +- tensorflow_similarity/search/faiss_search.py | 414 +++++++++--------- tensorflow_similarity/search/linear_search.py | 319 +++++++------- tensorflow_similarity/stores/cached_store.py | 43 +- tensorflow_similarity/stores/redis_store.py | 27 +- tests/search/test_faiss_search.py | 108 +++++ tests/search/test_linear_search.py | 101 +++++ tests/stores/test_cached_store.py | 68 +++ tests/stores/test_redis_store.py | 53 +++ 10 files changed, 754 insertions(+), 445 deletions(-) create mode 100644 tests/search/test_faiss_search.py create mode 100644 tests/search/test_linear_search.py create mode 100644 tests/stores/test_cached_store.py create mode 100644 tests/stores/test_redis_store.py diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index ffafcdfe..0ce32e82 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -21,10 +21,8 @@ from tabulate import tabulate - class BaseIndexer(ABC): - def __init__(self, distance, embedding_output, embedding_size, evaluator, - stat_buffer_size): + def __init__(self, distance, embedding_output, embedding_size, evaluator, stat_buffer_size): distance = distance_canonicalizer(distance) self.distance = distance # needed for save()/load() self.embedding_output = embedding_output @@ -44,7 +42,7 @@ def __init__(self, distance, embedding_output, embedding_size, evaluator, self.calibration_thresholds: Mapping[str, np.ndarray] = {} return - + # evaluation related functions def evaluate_retrieval( self, @@ -348,33 +346,33 @@ def match( @abstractmethod def add( - self, + self, prediction: FloatTensor, label: int | None = None, data: Tensor = None, build: bool = True, verbose: int = 1, 
): - """Add a single embedding to the indexer + """Add a single embedding to the indexer + + Args: + prediction: TF similarity model prediction, may be a multi-headed + output. - Args: - prediction: TF similarity model prediction, may be a multi-headed - output. + label: Label(s) associated with the + embedding. Defaults to None. - label: Label(s) associated with the - embedding. Defaults to None. + data: Input data associated with + the embedding. Defaults to None. - data: Input data associated with - the embedding. Defaults to None. + build: Rebuild the index after insertion. + Defaults to True. Set it to false if you would like to add + multiples batches/points and build it manually once after. - build: Rebuild the index after insertion. - Defaults to True. Set it to false if you would like to add - multiples batches/points and build it manually once after. + verbose: Display progress if set to 1. + Defaults to 1. + """ - verbose: Display progress if set to 1. - Defaults to 1. - """ - @abstractmethod def batch_add( self, @@ -384,22 +382,22 @@ def batch_add( build: bool = True, verbose: int = 1, ): - """Add a batch of embeddings to the indexer + """Add a batch of embeddings to the indexer - Args: - predictions: TF similarity model predictions, may be a multi-headed - output. + Args: + predictions: TF similarity model predictions, may be a multi-headed + output. - labels: label(s) associated with the embedding. Defaults to None. + labels: label(s) associated with the embedding. Defaults to None. - datas: input data associated with the embedding. Defaults to None. + datas: input data associated with the embedding. Defaults to None. - build: Rebuild the index after insertion. - Defaults to True. Set it to false if you would like to add - multiples batches/points and build it manually once after. + build: Rebuild the index after insertion. + Defaults to True. Set it to false if you would like to add + multiples batches/points and build it manually once after. 
- verbose: Display progress if set to 1. Defaults to 1. - """ + verbose: Display progress if set to 1. Defaults to 1. + """ @abstractmethod def single_lookup(self, prediction: FloatTensor, k: int = 5) -> list[Lookup]: @@ -414,8 +412,7 @@ def single_lookup(self, prediction: FloatTensor, k: int = 5) -> list[Lookup]: list of the k nearest neighbors info: list[Lookup] """ - - + @abstractmethod def batch_lookup(self, predictions: FloatTensor, k: int = 5, verbose: int = 1) -> list[list[Lookup]]: diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index f2e0518c..a16654ab 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -116,8 +116,7 @@ def __init__( Raises: ValueError: Invalid search framework or key value store. """ - super().__init__(distance, embedding_output, embedding_size, evaluator, - stat_buffer_size) + super().__init__(distance, embedding_output, embedding_size, evaluator, stat_buffer_size) # internal structure naming # FIXME support custom objects self.search_type = search diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index e4ac89b1..754e740f 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -12,216 +12,210 @@ class FaissSearch(Search): - """This class implements the Faiss ANN interface. - - It implements the Search interface. - """ - - def __init__( - self, - distance: Distance | str, - dim: int, - verbose: int = 0, - name: str | None = None, - algo="ivfpq", - m=8, - nbits=8, - nlist=1024, - nprobe=1, - normalize=True, - ): - """Initiate FAISS indexer - - Args: - d: number of dimensions - m: number of centroid IDs in final compressed vectors. 
d must be divisible - by m - nbits: number of bits in each centroid - nlist: how many Voronoi cells (must be greater than or equal to 2**nbits) - nprobe: how many of the nearest cells to include in search - """ - super().__init__(distance=distance, dim=dim, verbose=verbose, name=name) - self.algo = algo - self.m = m # number of bits per subquantizer - self.nbits = nbits - self.nlist = nlist - self.nprobe = nprobe - self.normalize = normalize - self.built = False - - if verbose: - t_msg = [ - "\n|-Initialize NMSLib Index", - f"| - algo: {self.algo}", - f"| - m: {self.m}", - f"| - nbits: {self.nbits}", - f"| - nlist: {self.nlist}", - f"| - nprobe: {self.nprobe}", - f"| - normalize: {self.normalize}", - f"| - query_params: {self.query_params}", - ] - cprint("\n".join(t_msg) + "\n", "green") - - if self.algo == "ivfpq": - assert dim % m == 0, f"dim={dim}, m={m}" - if self.algo == "ivfpq": - metric = faiss.METRIC_L2 - prefix = "" - if distance == "cosine": - prefix = "L2norm," - metric = faiss.METRIC_INNER_PRODUCT - # this distance requires both the input and query vectors to be normalized - ivf_string = f"IVF{nlist}," - pq_string = f"PQ{m}x{nbits}" - factory_string = prefix + ivf_string + pq_string - self.index = faiss.index_factory(dim, factory_string, metric) - # quantizer = faiss.IndexFlatIP( - # dim - # ) # we keep the same L2 distance flat index - # self.index = faiss.IndexIVFPQ( - # quantizer, dim, nlist, m, nbits, metric=faiss.METRIC_INNER_PRODUCT - # ) - # else: - # quantizer = faiss.IndexFlatL2( - # dim - # ) # we keep the same L2 distance flat index - # self.index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, nbits) - self.index.nprobe = nprobe # set how many of nearest cells to search - elif algo == "flat": - if distance == "cosine": - # this is exact match using cosine/dot-product Distance - self.index = faiss.IndexFlatIP(dim) - else: - # this is exact match using L2 distance - self.index = faiss.IndexFlatL2(dim) - - def is_built(self): - return self.built - 
- def needs_building(self): - if self.algo == "flat": - return False - else: - return not self.index.is_trained - - def build_index(self, samples, **kwargss): - if self.algo == "ivfpq": - if self.normalize: - faiss.normalize_L2(samples) - self.index.train(samples) # we must train the index to cluster into cells - self.built = True - - def batch_lookup( - self, embeddings: FloatTensor, k: int = 5 - ) -> tuple[list[list[int]], list[list[float]]]: - """Find embeddings K nearest neighboors embeddings. - - Args: - embedding: Batch of query embeddings as predicted by the model. - k: Number of nearest neighboors embedding to lookup. Defaults to 5. - """ - - if self.normalize: - faiss.normalize_L2(embeddings) - D, I = self.index.search(embeddings, k) - return I, D - - def lookup( - self, embedding: FloatTensor, k: int = 5 - ) -> tuple[list[int], list[float]]: - """Find embedding K nearest neighboors embeddings. + """This class implements the Faiss ANN interface. - Args: - embedding: Query embedding as predicted by the model. - k: Number of nearest neighboors embedding to lookup. Defaults to 5. - """ - int_embedding = np.array([embedding], dtype=np.float32) - if self.normalize: - faiss.normalize_L2(int_embedding) - D, I = self.index.search(int_embedding, k) - return I[0], D[0] - - def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): - """Add a single embedding to the search index. - - Args: - embedding: The embedding to index as computed by the similarity model. - idx: Embedding id as in the index table. Returned with the embedding to - allow to lookup the data associated with a given embedding. 
- """ - int_embedding = np.array([embedding], dtype=np.float32) - if self.normalize: - faiss.normalize_L2(int_embedding) - if self.algo != "flat": - self.index.add_with_ids(int_embedding) - else: - self.index.add(int_embedding) - - def batch_add( - self, - embeddings: FloatTensor, - idxs: Sequence[int], - verbose: int = 1, - normalize: bool = True, - **kwargs, - ): - """Add a batch of embeddings to the search index. - - Args: - embeddings: List of embeddings to add to the index. - idxs (int): Embedding ids as in the index table. Returned with the - embeddings to allow to lookup the data associated with the returned - embeddings. - verbose: Be verbose. Defaults to 1. - """ - if self.normalize: - faiss.normalize_L2(embeddings) - if self.algo != "flat": - # flat does not accept indexes as parameters and assumes incremental - # indexes - self.index.add_with_ids(embeddings, idxs) - else: - self.index.add(embeddings) - - def save(self, path: str): - """Serializes the index data on disk - - Args: - path: where to store the data + It implements the Search interface. """ - chunk = faiss.serialize_index(self.index) - np.save(self.__make_fname(path), chunk) - def __make_fname(self, path): - return str(Path(path) / "faiss_index.npy") - - def load(self, path: str): - """load index on disk - - Args: - path: where to store the data - """ - self.index = faiss.deserialize_index( - np.load(self.__make_fname(path)) - ) # identical to index - - def get_config(self) -> dict[str, Any]: - """Contains the search configuration. - - Returns: - A Python dict containing the configuration of the search obj. 
- """ - config = { - "distance": self.distance.name, - "dim": self.dim, - "algo": self.algo, - "m": self.m, - "nlist": self.nlist, - "nprobe": self.nprobe, - "normalize": self.normalize, - "verbose": self.verbose, - "name": self.name, - "canonical_name": self.__class__.__name__, - } - - return config + def __init__( + self, + distance: Distance | str, + dim: int, + verbose: int = 0, + name: str | None = None, + algo="ivfpq", + m=8, + nbits=8, + nlist=1024, + nprobe=1, + normalize=True, + ): + """Initiate FAISS indexer + + Args: + d: number of dimensions + m: number of centroid IDs in final compressed vectors. d must be divisible + by m + nbits: number of bits in each centroid + nlist: how many Voronoi cells (must be greater than or equal to 2**nbits) + nprobe: how many of the nearest cells to include in search + """ + super().__init__(distance=distance, dim=dim, verbose=verbose, name=name) + self.algo = algo + self.m = m # number of bits per subquantizer + self.nbits = nbits + self.nlist = nlist + self.nprobe = nprobe + self.normalize = normalize + self.built = False + + if verbose: + t_msg = [ + "\n|-Initialize NMSLib Index", + f"| - algo: {self.algo}", + f"| - m: {self.m}", + f"| - nbits: {self.nbits}", + f"| - nlist: {self.nlist}", + f"| - nprobe: {self.nprobe}", + f"| - normalize: {self.normalize}", + f"| - query_params: {self.query_params}", + ] + cprint("\n".join(t_msg) + "\n", "green") + + if self.algo == "ivfpq": + assert dim % m == 0, f"dim={dim}, m={m}" + if self.algo == "ivfpq": + metric = faiss.METRIC_L2 + prefix = "" + if distance == "cosine": + prefix = "L2norm," + metric = faiss.METRIC_INNER_PRODUCT + # this distance requires both the input and query vectors to be normalized + ivf_string = f"IVF{nlist}," + pq_string = f"PQ{m}x{nbits}" + factory_string = prefix + ivf_string + pq_string + self.index = faiss.index_factory(dim, factory_string, metric) + # quantizer = faiss.IndexFlatIP( + # dim + # ) # we keep the same L2 distance flat index + # 
self.index = faiss.IndexIVFPQ( + # quantizer, dim, nlist, m, nbits, metric=faiss.METRIC_INNER_PRODUCT + # ) + # else: + # quantizer = faiss.IndexFlatL2( + # dim + # ) # we keep the same L2 distance flat index + # self.index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, nbits) + self.index.nprobe = nprobe # set how many of nearest cells to search + elif algo == "flat": + if distance == "cosine": + # this is exact match using cosine/dot-product Distance + self.index = faiss.IndexFlatIP(dim) + else: + # this is exact match using L2 distance + self.index = faiss.IndexFlatL2(dim) + + def is_built(self): + return self.built + + def needs_building(self): + if self.algo == "flat": + return False + else: + return not self.index.is_trained + + def build_index(self, samples, **kwargss): + if self.algo == "ivfpq": + if self.normalize: + faiss.normalize_L2(samples) + self.index.train(samples) # we must train the index to cluster into cells + self.built = True + + def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[int]], list[list[float]]]: + """Find embeddings K nearest neighboors embeddings. + + Args: + embedding: Batch of query embeddings as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. + """ + + if self.normalize: + faiss.normalize_L2(embeddings) + D, I = self.index.search(embeddings, k) + return I, D + + def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[float]]: + """Find embedding K nearest neighboors embeddings. + + Args: + embedding: Query embedding as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. + """ + int_embedding = np.array([embedding], dtype=np.float32) + if self.normalize: + faiss.normalize_L2(int_embedding) + D, I = self.index.search(int_embedding, k) + return I[0], D[0] + + def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): + """Add a single embedding to the search index. 
+ + Args: + embedding: The embedding to index as computed by the similarity model. + idx: Embedding id as in the index table. Returned with the embedding to + allow to lookup the data associated with a given embedding. + """ + int_embedding = np.array([embedding], dtype=np.float32) + if self.normalize: + faiss.normalize_L2(int_embedding) + if self.algo != "flat": + self.index.add_with_ids(int_embedding) + else: + self.index.add(int_embedding) + + def batch_add( + self, + embeddings: FloatTensor, + idxs: Sequence[int], + verbose: int = 1, + normalize: bool = True, + **kwargs, + ): + """Add a batch of embeddings to the search index. + + Args: + embeddings: List of embeddings to add to the index. + idxs (int): Embedding ids as in the index table. Returned with the + embeddings to allow to lookup the data associated with the returned + embeddings. + verbose: Be verbose. Defaults to 1. + """ + if self.normalize: + faiss.normalize_L2(embeddings) + if self.algo != "flat": + # flat does not accept indexes as parameters and assumes incremental + # indexes + self.index.add_with_ids(embeddings, idxs) + else: + self.index.add(embeddings) + + def save(self, path: str): + """Serializes the index data on disk + + Args: + path: where to store the data + """ + chunk = faiss.serialize_index(self.index) + np.save(self.__make_fname(path), chunk) + + def __make_fname(self, path): + return str(Path(path) / "faiss_index.npy") + + def load(self, path: str): + """load index on disk + + Args: + path: where to store the data + """ + self.index = faiss.deserialize_index(np.load(self.__make_fname(path))) # identical to index + + def get_config(self) -> dict[str, Any]: + """Contains the search configuration. + + Returns: + A Python dict containing the configuration of the search obj. 
+ """ + config = { + "distance": self.distance.name, + "dim": self.dim, + "algo": self.algo, + "m": self.m, + "nlist": self.nlist, + "nprobe": self.nprobe, + "normalize": self.normalize, + "verbose": self.verbose, + "name": self.name, + "canonical_name": self.__class__.__name__, + } + + return config diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index 65cf536f..7cbac45e 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -17,167 +17,164 @@ class LinearSearch(Search): - """This class implements the Linear Search interface. - - It implements the Search interface. - """ - - def __init__( - self, - distance: Distance | str, - dim: int, - verbose: int = 0, - name: str | None = None, - ): - """Initiate Linear indexer. - - Args: - d: number of dimensions - m: number of centroid IDs in final compressed vectors. d must be divisible - by m - nbits: number of bits in each centroid - nlist: how many Voronoi cells (must be greater than or equal to 2**nbits) - nprobe: how many of the nearest cells to include in search - """ - super().__init__(distance=distance, dim=dim, verbose=verbose, name=name) - - if verbose: - t_msg = [ - "\n|-Initialize NMSLib Index", - f"| - distance: {self.distance}", - f"| - dim: {self.dim}", - f"| - verbose: {self.verbose}", - f"| - name: {self.name}", - ] - cprint("\n".join(t_msg) + "\n", "green") - self.db = np.empty((INITIAL_DB_SIZE, dim), dtype=np.float32) - self.ids = [] - - - - def is_built(self): - return True - - def needs_building(self): - return False - - def batch_lookup( - self, embeddings: FloatTensor, k: int = 5 - ) -> tuple[list[list[int]], list[list[float]]]: - """Find embeddings K nearest neighboors embeddings. - - Args: - embedding: Batch of query embeddings as predicted by the model. - k: Number of nearest neighboors embedding to lookup. Defaults to 5. - """ + """This class implements the Linear Search interface. 
- normalized_query = tf.math.l2_normalize(embeddings, axis=1) - items = len(self.ids) - sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) - similarity, id_idxs = tf.math.top_k(sims, k) - ids_array = np.array(self.ids) - return np.array([ids_array[x.numpy()] for x in id_idxs]), similarity - - def lookup( - self, embedding: FloatTensor, k: int = 5 - ) -> tuple[list[int], list[float]]: - """Find embedding K nearest neighboors embeddings. - - Args: - embedding: Query embedding as predicted by the model. - k: Number of nearest neighboors embedding to lookup. Defaults to 5. - """ - normalized_query = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) - items = len(self.ids) - sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) - similarity, id_idxs = tf.math.top_k(sims, k) - ids_array = np.array(self.ids) - return np.array(ids_array[id_idxs[0].numpy()]), similarity[0] - - def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): - """Add a single embedding to the search index. - - Args: - embedding: The embedding to index as computed by the similarity model. - idx: Embedding id as in the index table. Returned with the embedding to - allow to lookup the data associated with a given embedding. - """ - int_embedding = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) - items = len(self.ids) - if items + 1 > self.db.shape[0]: - # it's full - new_db = np.empty((len(self.ids) + DB_SIZE_STEPS, self.dim), dtype=np.float32) - new_db[:items] = self.db - self.db = new_db - self.ids.append(idx) - self.db[items] = int_embedding - - def batch_add( - self, - embeddings: FloatTensor, - idxs: Sequence[int], - verbose: int = 1, - normalize: bool = True, - **kwargs, - ): - """Add a batch of embeddings to the search index. - - Args: - embeddings: List of embeddings to add to the index. - idxs (int): Embedding ids as in the index table. 
Returned with the - embeddings to allow to lookup the data associated with the returned - embeddings. - verbose: Be verbose. Defaults to 1. - """ - int_embeddings = tf.math.l2_normalize(embeddings, axis=1) - items = len(self.ids) - if items + len(embeddings) > self.db.shape[0]: - # it's full - new_db = np.empty((((items + len(embeddings) + DB_SIZE_STEPS) // DB_SIZE_STEPS) * DB_SIZE_STEPS, self.dim), dtype=np.float32) - new_db[:items] = self.db - self.db = new_db - self.ids.extend(idxs) - self.db[items:items+len(embeddings)] = int_embeddings - - def __make_file_path(self, path): - return path / "index.pickle" - - def save(self, path: str): - """Serializes the index data on disk - - Args: - path: where to store the data - """ - with open(self.__make_file_path(path), "wb") as f: - pickle.dump((self.db, self.ids), f) - - def load(self, path: str): - """load index on disk - - Args: - path: where to store the data - """ - with open(self.__make_file_path(path), "rb") as f: - data = pickle.load(f) - self.db = data[0] - self.ids = data[1] - - def __make_config_path(self, path): - return path / "config.json" - - def __save_config(self): - with open(self.__make_config_file_path(path), "wt") as f: - json.dump(self.get_config(), f) - - def get_config(self) -> dict[str, Any]: - """Contains the search configuration. - - Returns: - A Python dict containing the configuration of the search obj. + It implements the Search interface. """ - config = { - "distance": self.distance.name, - "dim": self.dim, - } - return config + def __init__( + self, + distance: Distance | str, + dim: int, + verbose: int = 0, + name: str | None = None, + ): + """Initiate Linear indexer. + + Args: + d: number of dimensions + m: number of centroid IDs in final compressed vectors. 
d must be divisible + by m + nbits: number of bits in each centroid + nlist: how many Voronoi cells (must be greater than or equal to 2**nbits) + nprobe: how many of the nearest cells to include in search + """ + super().__init__(distance=distance, dim=dim, verbose=verbose, name=name) + + if verbose: + t_msg = [ + "\n|-Initialize NMSLib Index", + f"| - distance: {self.distance}", + f"| - dim: {self.dim}", + f"| - verbose: {self.verbose}", + f"| - name: {self.name}", + ] + cprint("\n".join(t_msg) + "\n", "green") + self.db = np.empty((INITIAL_DB_SIZE, dim), dtype=np.float32) + self.ids = [] + + def is_built(self): + return True + + def needs_building(self): + return False + + def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[int]], list[list[float]]]: + """Find embeddings K nearest neighboors embeddings. + + Args: + embedding: Batch of query embeddings as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. + """ + + normalized_query = tf.math.l2_normalize(embeddings, axis=1) + items = len(self.ids) + sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) + similarity, id_idxs = tf.math.top_k(sims, k) + ids_array = np.array(self.ids) + return np.array([ids_array[x.numpy()] for x in id_idxs]), similarity + + def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[float]]: + """Find embedding K nearest neighboors embeddings. + + Args: + embedding: Query embedding as predicted by the model. + k: Number of nearest neighboors embedding to lookup. Defaults to 5. 
+ """ + normalized_query = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) + items = len(self.ids) + sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) + similarity, id_idxs = tf.math.top_k(sims, k) + ids_array = np.array(self.ids) + return np.array(ids_array[id_idxs[0].numpy()]), similarity[0] + + def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): + """Add a single embedding to the search index. + + Args: + embedding: The embedding to index as computed by the similarity model. + idx: Embedding id as in the index table. Returned with the embedding to + allow to lookup the data associated with a given embedding. + """ + int_embedding = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) + items = len(self.ids) + if items + 1 > self.db.shape[0]: + # it's full + new_db = np.empty((len(self.ids) + DB_SIZE_STEPS, self.dim), dtype=np.float32) + new_db[:items] = self.db + self.db = new_db + self.ids.append(idx) + self.db[items] = int_embedding + + def batch_add( + self, + embeddings: FloatTensor, + idxs: Sequence[int], + verbose: int = 1, + normalize: bool = True, + **kwargs, + ): + """Add a batch of embeddings to the search index. + + Args: + embeddings: List of embeddings to add to the index. + idxs (int): Embedding ids as in the index table. Returned with the + embeddings to allow to lookup the data associated with the returned + embeddings. + verbose: Be verbose. Defaults to 1. 
+ """ + int_embeddings = tf.math.l2_normalize(embeddings, axis=1) + items = len(self.ids) + if items + len(embeddings) > self.db.shape[0]: + # it's full + new_db = np.empty( + (((items + len(embeddings) + DB_SIZE_STEPS) // DB_SIZE_STEPS) * DB_SIZE_STEPS, self.dim), + dtype=np.float32, + ) + new_db[:items] = self.db + self.db = new_db + self.ids.extend(idxs) + self.db[items : items + len(embeddings)] = int_embeddings + + def __make_file_path(self, path): + return path / "index.pickle" + + def save(self, path: str): + """Serializes the index data on disk + + Args: + path: where to store the data + """ + with open(self.__make_file_path(path), "wb") as f: + pickle.dump((self.db, self.ids), f) + + def load(self, path: str): + """load index on disk + + Args: + path: where to store the data + """ + with open(self.__make_file_path(path), "rb") as f: + data = pickle.load(f) + self.db = data[0] + self.ids = data[1] + + def __make_config_path(self, path): + return path / "config.json" + + def __save_config(self): + with open(self.__make_config_file_path(path), "wt") as f: + json.dump(self.get_config(), f) + + def get_config(self) -> dict[str, Any]: + """Contains the search configuration. + + Returns: + A Python dict containing the configuration of the search obj. + """ + config = { + "distance": self.distance.name, + "dim": self.dim, + } + + return config diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index 1cfdc55b..b17f3564 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -41,20 +41,20 @@ def __init__(self, shard_size=1000000) -> None: self.shard_size = shard_size self.num_items: int = 0 self.path: str = "." 
- + def __get_shard_file_path(self, shard_no): - return f'{self.path}/cache{shard_no}' - + return f"{self.path}/cache{shard_no}" + def __make_new_shard(self, shard_no: int): - return dbm.open(self.__get_shard_file_path(shard_no), 'c') - + return dbm.open(self.__get_shard_file_path(shard_no), "c") + def __add_new_shard(self): - shard_no = len(self.db) - self.db.append(self.__make_new_shard(shard_no)) + shard_no = len(self.db) + self.db.append(self.__make_new_shard(shard_no)) def __reopen_all_shards(self): for shard_no in range(len(self.db)): - self.db[shard_no] = self.__make_new_shard(shard_no) + self.db[shard_no] = self.__make_new_shard(shard_no) def add( self, @@ -110,10 +110,10 @@ def batch_add( rec_data = None if data is None else data[i] shard_no = idx // self.shard_size if len(self.db) <= shard_no: - self.__add_new_shard() + self.__add_new_shard() self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, rec_data)) idxs.append(idx) - + return idxs def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: @@ -156,22 +156,22 @@ def size(self) -> int: def __close_all_shards(self): for shard in self.db: shard.close() - + def __copy_shards(self, path): for shard_no in range(len(self.db)): - shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix('.db'), path) - + shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix(".db"), path) + def __make_config_file_path(self, path): - return path / "config.json" - + return path / "config.json" + def __save_config(self, path): with open(self.__make_config_file_path(path), "wt") as f: json.dump(self.get_config(), f) - + def __set_config(self, num_items, shard_size): self.num_items = num_items self.shard_size = shard_size - + def __load_config(self, path): with open(self.__make_config_file_path(path), "rt") as f: self.__set_config(**json.load(f)) @@ -191,11 +191,8 @@ def save(self, path: str, compression: bool = True) -> None: self.__reopen_all_shards() def get_config(self): - return { 
- "shard_size": self.shard_size, - "num_items": self.num_items - } - + return {"shard_size": self.shard_size, "num_items": self.num_items} + def load(self, path: str) -> int: """load index on disk @@ -214,7 +211,7 @@ def load(self, path: str) -> int: def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: """Export data as a Pandas dataframe. - + Cached store does not fit in memory, therefore we do not implement this. Args: diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index 2a51d55f..14f57632 100644 --- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -31,7 +31,7 @@ def __init__(self, host="localhost", port=6379, db=0) -> None: self.port = port self.db = db self.__connect() - + def add( self, embedding: FloatTensor, @@ -53,9 +53,9 @@ def add( num_items = self.__conn.incr("num_items") idx = num_items - 1 self.__conn.set(idx, (embedding, label, data)) - + return idx - + def get_num_items(self): return self.__conn.get("num_items") or 0 @@ -86,7 +86,7 @@ def batch_add( rec_data = None if data is None else data[i] idx = self.add(embedding, label, rec_data) idxs.append(idx) - + return idxs def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: @@ -125,17 +125,17 @@ def size(self) -> int: return self.get_num_items() def __make_config_file_path(self, path): - return path / "config.json" - + return path / "config.json" + def __save_config(self, path): with open(self.__make_config_file_path(path), "wt") as f: json.dump(self.get_config(), f) - + def __set_config(self, host, port, db): self.host = host self.port = port self.db = db - + def __connect(self): self.__conn = redis.Redis(host=self.host, port=self.port, db=self.db) @@ -156,13 +156,8 @@ def save(self, path: str, compression: bool = True) -> None: self.__save_config(path) def get_config(self): - return { - "host": self.host, - "port": self.port, - "db": self.db, - "num_items": 
self.get_num_items() - } - + return {"host": self.host, "port": self.port, "db": self.db, "num_items": self.get_num_items()} + def load(self, path: str) -> int: """load index on disk @@ -177,7 +172,7 @@ def load(self, path: str) -> int: def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: """Export data as a Pandas dataframe. - + Cached store does not fit in memory, therefore we do not implement this. Args: diff --git a/tests/search/test_faiss_search.py b/tests/search/test_faiss_search.py new file mode 100644 index 00000000..1963f78c --- /dev/null +++ b/tests/search/test_faiss_search.py @@ -0,0 +1,108 @@ +import numpy as np + +from tensorflow_similarity.search import FaissSearch + + +def test_index_match(): + target = np.array([1, 1, 2], dtype="float32") + embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") + + search_index = FaissSearch("cosine", 3, algo="flat") + search_index.add(embs[0], 0) + search_index.add(embs[1], 1) + + idxs, embs = search_index.lookup(target, k=2) + print(f"idxs={idxs}, embs={embs}") + + assert len(embs) == 2 + assert list(idxs) == [0, 1] + + +def test_index_save(tmp_path): + target = np.array([1, 1, 2], dtype="float32") + embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") + k = 2 + + search_index = FaissSearch("cosine", 3, algo="flat") + search_index.add(embs[0], 0) + search_index.add(embs[1], 1) + + idxs, embs = search_index.lookup(target, k=k) + print(f"idxs={idxs}, embs={embs}") + + assert len(embs) == k + assert list(idxs) == [0, 1] + + search_index.save(tmp_path) + + search_index2 = FaissSearch("cosine", 3, algo="flat") + search_index2.load(tmp_path) + + idxs2, embs2 = search_index.lookup(target, k=k) + print(f"idxs2={idxs2}, embs2={embs2}") + assert len(embs2) == k + assert list(idxs2) == [0, 1] + + # add more + # if the dtype is not passed we get an incompatible type error + search_index2.add(np.array([3.0, 3.0, 3.0], dtype="float32"), 3) + idxs3, embs3 = search_index2.lookup(target, k=3) + 
print(f"idxs3={idxs3}, embs3={embs3}") + assert len(embs3) == 3 + assert list(idxs3) == [0, 2, 1] + + +def test_batch_vs_single(tmp_path): + num_targets = 10 + index_size = 100 + vect_dim = 16 + + # gen + idxs = list(range(index_size)) + + targets = np.random.random((num_targets, vect_dim)).astype("float32") + embs = np.random.random((index_size, vect_dim)).astype("float32") + + # build search_index + search_index = FaissSearch("cosine", vect_dim, algo="flat") + search_index.batch_add(embs, idxs) + + # batch + batch_idxs, _ = search_index.batch_lookup(targets) + + # single + singles_idxs = [] + for t in targets: + idxs, embs = search_index.lookup(t) + singles_idxs.append(idxs) + + for i in range(num_targets): + # k neigboors are the same? + for k in range(3): + assert batch_idxs[i][k] == singles_idxs[i][k] + + +def test_ivfpq(): + # test ivfpq ANN indexing with 100M entries + num_targets = 10 + index_size = 10000 + vect_dim = 16 + + # gen + idxs = np.array(list(range(index_size))) + + targets = np.random.random((num_targets, vect_dim)).astype("float32") + embs = np.random.random((index_size, vect_dim)).astype("float32") + + search_index = FaissSearch("cosine", vect_dim, algo="ivfpq") + assert search_index.is_built() == False + search_index.build_index(embs) + assert search_index.is_built() == True + last_idx = 0 + for i in range(1000): + idxs = np.array(list(range(last_idx, last_idx + index_size))) + embs = np.random.random((index_size, vect_dim)).astype("float32") + last_idx += index_size + search_index.batch_add(embs, idxs) + found_idxs, found_dists = search_index.batch_lookup(targets, 2) + assert found_idxs.shape == (10, 2) diff --git a/tests/search/test_linear_search.py b/tests/search/test_linear_search.py new file mode 100644 index 00000000..bad1bbe3 --- /dev/null +++ b/tests/search/test_linear_search.py @@ -0,0 +1,101 @@ +import numpy as np + +from tensorflow_similarity.search import LinearSearch + + +def test_index_match(): + target = np.array([1, 1, 2], 
dtype="float32") + embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") + + search_index = LinearSearch("cosine", 3) + search_index.add(embs[0], 0) + search_index.add(embs[1], 1) + + idxs, embs = search_index.lookup(target, k=2) + + assert len(embs) == 2 + assert list(idxs) == [0, 1] + + +def test_index_save(tmp_path): + target = np.array([1, 1, 2], dtype="float32") + embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") + k = 2 + + search_index = LinearSearch("cosine", 3) + search_index.add(embs[0], 0) + search_index.add(embs[1], 1) + + idxs, embs = search_index.lookup(target, k=k) + + assert len(embs) == k + assert list(idxs) == [0, 1] + + search_index.save(tmp_path) + + search_index2 = LinearSearch("cosine", 3) + search_index2.load(tmp_path) + + idxs2, embs2 = search_index.lookup(target, k=k) + assert len(embs2) == k + assert list(idxs2) == [0, 1] + + # add more + # if the dtype is not passed we get an incompatible type error + search_index2.add(np.array([3.0, 3.0, 3.0], dtype="float32"), 3) + idxs3, embs3 = search_index2.lookup(target, k=3) + assert len(embs3) == 3 + assert list(idxs3) == [0, 3, 1] + + +def test_batch_vs_single(tmp_path): + num_targets = 10 + index_size = 100 + vect_dim = 16 + + # gen + idxs = list(range(index_size)) + + targets = np.random.random((num_targets, vect_dim)).astype("float32") + embs = np.random.random((index_size, vect_dim)).astype("float32") + + # build search_index + search_index = LinearSearch("cosine", vect_dim) + search_index.batch_add(embs, idxs) + + # batch + batch_idxs, _ = search_index.batch_lookup(targets) + + # single + singles_idxs = [] + for t in targets: + idxs, embs = search_index.lookup(t) + singles_idxs.append(idxs) + + for i in range(num_targets): + # k neigboors are the same? 
+ for k in range(3): + assert batch_idxs[i][k] == singles_idxs[i][k] + + +def test_running_larger_batches(): + num_targets = 10 + index_size = 1000 + vect_dim = 16 + + # gen + idxs = np.array(list(range(index_size))) + + targets = np.random.random((num_targets, vect_dim)).astype("float32") + embs = np.random.random((index_size, vect_dim)).astype("float32") + + search_index = LinearSearch("cosine", vect_dim) + assert search_index.is_built() == True + last_idx = 0 + for i in range(1000): + idxs = np.array(list(range(last_idx, last_idx + index_size))) + embs = np.random.random((index_size, vect_dim)).astype("float32") + last_idx += index_size + search_index.batch_add(embs, idxs) + found_idxs, found_dists = search_index.batch_lookup(targets, 2) + assert found_idxs.shape == (10, 2) diff --git a/tests/stores/test_cached_store.py b/tests/stores/test_cached_store.py new file mode 100644 index 00000000..036b8c1f --- /dev/null +++ b/tests/stores/test_cached_store.py @@ -0,0 +1,68 @@ +import numpy as np + +from tensorflow_similarity.stores import CachedStore + + +def build_store(records): + kv_store = CachedStore() + idxs = [] + for r in records: + idx = kv_store.add(r[0], r[1], r[2]) + idxs.append(idx) + return kv_store, idxs + + +def test_cached_store_and_retrieve(): + records = [[[0.1, 0.2], 1, [0, 0, 0]], [[0.2, 0.3], 2, [0, 0, 0]]] + + kv_store, idxs = build_store(records) + + # check index numbering + for gt, idx in enumerate(idxs): + assert isinstance(idx, int) + assert gt == idx + + # check reference counting + assert kv_store.size() == 2 + + # get back three elements + for idx in idxs: + emb, lbl, dt = kv_store.get(idx) + assert emb == records[idx][0] + assert lbl == records[idx][1] + assert dt == records[idx][2] + + +def test_batch_add(): + embs = np.array([[0.1, 0.2], [0.2, 0.3]]) + lbls = np.array([1, 2]) + data = np.array([[0, 0, 0], [1, 1, 1]]) + + kv_store = CachedStore() + idxs = kv_store.batch_add(embs, lbls, data) + for idx in idxs: + emb, lbl, dt = 
kv_store.get(idx) + assert np.array_equal(emb, embs[idx]) + assert np.array_equal(lbl, lbls[idx]) + assert np.array_equal(dt, data[idx]) + + +def test_save_and_reload(tmp_path): + records = [[[0.1, 0.2], 1, [0, 0, 0]], [[0.2, 0.3], 2, [0, 0, 0]]] + + kv_store, idxs = build_store(records) + kv_store.save(tmp_path) + + # reload + reloaded_store = CachedStore() + print(f"loading from {tmp_path}") + reloaded_store.load(tmp_path) + + assert reloaded_store.size() == 2 + + # get back three elements + for idx in idxs: + emb, lbl, dt = reloaded_store.get(idx) + assert np.array_equal(emb, records[idx][0]) + assert np.array_equal(lbl, records[idx][1]) + assert np.array_equal(dt, records[idx][2]) diff --git a/tests/stores/test_redis_store.py b/tests/stores/test_redis_store.py new file mode 100644 index 00000000..d739ebde --- /dev/null +++ b/tests/stores/test_redis_store.py @@ -0,0 +1,53 @@ +from unittest.mock import MagicMock +from unittest.mock import patch + +import numpy as np +from tensorflow_similarity.stores import RedisStore + + +def build_store(records): + kv_store = RedisStore() + idxs = [] + for r in records: + idx = kv_store.add(r[0], r[1], r[2]) + idxs.append(idx) + return kv_store, idxs + + +@patch("redis.Redis", return_value=MagicMock()) +def test_store_and_retrieve(mock_redis): + records = [[[0.1, 0.2], 1, [0, 0, 0]], [[0.2, 0.3], 2, [0, 0, 0]]] + mock_redis.return_value.get.side_effect = records + mock_redis.return_value.incr.side_effect = [1, 2, 3, 4, 5] + + kv_store, idxs = build_store(records) + + # check index numbering + for gt, idx in enumerate(idxs): + assert isinstance(idx, int) + assert gt == idx + + # get back three elements + for idx in idxs: + emb, lbl, dt = kv_store.get(idx) + assert emb == records[idx][0] + assert lbl == records[idx][1] + assert dt == records[idx][2] + + +@patch("redis.Redis", return_value=MagicMock()) +def test_batch_add(mock_redis): + embs = np.array([[0.1, 0.2], [0.2, 0.3]]) + lbls = np.array([1, 2]) + data = np.array([[0, 0, 
0], [1, 1, 1]]) + + mock_redis.return_value.get.side_effect = [[embs[i], lbls[i], data[i]] for i in range(2)] + mock_redis.return_value.incr.side_effect = [1, 2, 3, 4, 5] + + kv_store = RedisStore() + idxs = kv_store.batch_add(embs, lbls, data) + for idx in idxs: + emb, lbl, dt = kv_store.get(idx) + assert np.array_equal(emb, embs[idx]) + assert np.array_equal(lbl, lbls[idx]) + assert np.array_equal(dt, data[idx]) From e201ac0f64e90d58783061fde7a66bb2457ba542 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 27 Feb 2023 14:25:28 -0800 Subject: [PATCH 03/35] formatting and fixing couple of issues --- tensorflow_similarity/base_indexer.py | 10 +++--- tensorflow_similarity/indexer.py | 32 +++++++------------ tensorflow_similarity/search/faiss_search.py | 10 +++--- tensorflow_similarity/search/linear_search.py | 4 +-- tensorflow_similarity/stores/cached_store.py | 6 +--- tensorflow_similarity/stores/redis_store.py | 1 + 6 files changed, 25 insertions(+), 38 deletions(-) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index 0ce32e82..4ef65861 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -1,15 +1,14 @@ from abc import ABC, abstractmethod import numpy as np import tensorflow as tf -from .types import CalibrationResults, FloatTensor, Lookup, PandasDataFrame, Tensor +from .types import CalibrationResults, FloatTensor, Lookup, Tensor from collections.abc import Mapping, MutableMapping, Sequence from .retrieval_metrics import RetrievalMetric -from .distances import Distance, distance_canonicalizer -from .evaluators import Evaluator, MemoryEvaluator +from .distances import distance_canonicalizer from .matchers import ClassificationMatch, make_classification_matcher -from .retrieval_metrics import RetrievalMetric from .utils import unpack_lookup_distances, unpack_lookup_labels -from collections import defaultdict, deque +from collections import defaultdict +from tqdm.auto import 
tqdm from .classification_metrics import ( @@ -17,7 +16,6 @@ F1Score, make_classification_metric, ) -from .matchers import ClassificationMatch, make_classification_matcher from tabulate import tabulate diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index a16654ab..c6755b62 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -18,17 +18,13 @@ import json from collections import defaultdict, deque -from collections.abc import Mapping, MutableMapping, Sequence from pathlib import Path from time import time from .base_indexer import BaseIndexer from typing import ( DefaultDict, Deque, - Dict, List, - Mapping, - MutableMapping, Optional, Sequence, Union, @@ -40,20 +36,16 @@ from tqdm.auto import tqdm from .classification_metrics import ( - ClassificationMetric, F1Score, make_classification_metric, ) # internal -from .distances import Distance, distance_canonicalizer +from .distances import Distance from .evaluators import Evaluator, MemoryEvaluator -from .matchers import ClassificationMatch, make_classification_matcher -from .retrieval_metrics import RetrievalMetric -from .search import NMSLibSearch, Search, make_search +from .search import NMSLibSearch, Search, make_search, LinearSearch from .stores import MemoryStore, Store -from .types import CalibrationResults, FloatTensor, Lookup, PandasDataFrame, Tensor -from .utils import unpack_lookup_distances, unpack_lookup_labels +from .types import FloatTensor, Lookup, PandasDataFrame, Tensor class Indexer(BaseIndexer): @@ -134,7 +126,7 @@ def _init_structures(self) -> None: if self.search_type == "nmslib": self.search: Search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) elif self.search_type == "linear": - self.search = LinearSearch(distance=self.distance, dim=embedding_size) + self.search = LinearSearch(distance=self.distance, dim=self.embedding_size) elif isinstance(self.search_type, Search): self.search = self.search_type else: @@ 
-157,8 +149,8 @@ def _init_structures(self) -> None: raise ValueError("You need to either supply a know evaluator name " "or an Evaluator() object") # stats - self._stats: defaultdict[str, int] = defaultdict(int) - self._lookup_timings_buffer: deque[float] = deque([], maxlen=self.stat_buffer_size) + self._stats: DefaultDict[str, int] = defaultdict(int) + self._lookup_timings_buffer: Deque[float] = deque([], maxlen=self.stat_buffer_size) # calibration data self.is_calibrated = False @@ -206,7 +198,7 @@ def _get_embeddings(self, predictions: FloatTensor) -> FloatTensor: embeddings = predictions return embeddings - def _cast_label(self, label: int | None) -> int | None: + def _cast_label(self, label: Optional[int]) -> Optional[int]: if label is not None: label = int(label) return label @@ -214,7 +206,7 @@ def _cast_label(self, label: int | None) -> int | None: def add( self, prediction: FloatTensor, - label: int | None = None, + label: Optional[int] = None, data: Tensor = None, build: bool = True, verbose: int = 1, @@ -251,8 +243,8 @@ def add( def batch_add( self, predictions: FloatTensor, - labels: Sequence[int] | None = None, - data: Tensor | None = None, + labels: Optional[Sequence[int]] = None, + data: Optional[Tensor] = None, build: bool = True, verbose: int = 1, ): @@ -282,7 +274,7 @@ def batch_add( idxs = self.kv_store.batch_add(embeddings, labels, data) self.search.batch_add(embeddings, idxs, build=build, verbose=verbose) - def single_lookup(self, prediction: FloatTensor, k: int = 5) -> list[Lookup]: + def single_lookup(self, prediction: FloatTensor, k: int = 5) -> List[Lookup]: """Find the k closest matches of a given embedding Args: @@ -317,7 +309,7 @@ def single_lookup(self, prediction: FloatTensor, k: int = 5) -> list[Lookup]: self._stats["num_lookups"] += 1 return lookups - def batch_lookup(self, predictions: FloatTensor, k: int = 5, verbose: int = 1) -> list[list[Lookup]]: + def batch_lookup(self, predictions: FloatTensor, k: int = 5, verbose: int = 1) 
-> List[List[Lookup]]: """Find the k closest matches for a set of embeddings diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index 754e740f..85037b46 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -1,6 +1,6 @@ """The module to handle FAISS search.""" -from collections.abc import Mapping, Sequence +from collections.abc import Sequence from termcolor import cprint from .search import Search import faiss @@ -121,8 +121,8 @@ def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[i if self.normalize: faiss.normalize_L2(embeddings) - D, I = self.index.search(embeddings, k) - return I, D + sims, indices = self.index.search(embeddings, k) + return indices, sims def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[float]]: """Find embedding K nearest neighboors embeddings. @@ -134,8 +134,8 @@ def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[fl int_embedding = np.array([embedding], dtype=np.float32) if self.normalize: faiss.normalize_L2(int_embedding) - D, I = self.index.search(int_embedding, k) - return I[0], D[0] + sims, indices = self.index.search(int_embedding, k) + return indices[0], sims[0] def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): """Add a single embedding to the search index. 
diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index 7cbac45e..908b2cc6 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -4,7 +4,6 @@ from .search import Search from tensorflow_similarity.distances import Distance from tensorflow_similarity.types import FloatTensor -from pathlib import Path from typing import Any import numpy as np import tensorflow as tf @@ -147,6 +146,7 @@ def save(self, path: str): """ with open(self.__make_file_path(path), "wb") as f: pickle.dump((self.db, self.ids), f) + self.__save_config(path) def load(self, path: str): """load index on disk @@ -162,7 +162,7 @@ def load(self, path: str): def __make_config_path(self, path): return path / "config.json" - def __save_config(self): + def __save_config(self, path): with open(self.__make_config_file_path(path), "wt") as f: json.dump(self.get_config(), f) diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index b17f3564..d64fe386 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -13,13 +13,9 @@ # limitations under the License. 
from __future__ import annotations -import io from collections.abc import Sequence from pathlib import Path -import numpy as np -import pandas as pd -import tensorflow as tf import pickle import shutil import dbm @@ -205,7 +201,7 @@ def load(self, path: str) -> int: self.__load_config(path) num_shards = int(math.ceil(self.num_items / self.shard_size)) self.path = path - for i in range(self.num_items): + for i in range(num_shards): self.__add_new_shard() return self.size() diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index 14f57632..387234d8 100644 --- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -15,6 +15,7 @@ from collections.abc import Sequence +import json import redis from .store import Store From 01abbe1fd4a2f27ff1a12b57018918efe02c0aab Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Tue, 28 Feb 2023 09:21:27 -0800 Subject: [PATCH 04/35] fix the backward compatibility issue --- tensorflow_similarity/base_indexer.py | 2 ++ tensorflow_similarity/search/faiss_search.py | 2 ++ tensorflow_similarity/search/linear_search.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index 4ef65861..b4970529 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import ABC, abstractmethod import numpy as np import tensorflow as tf diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index 85037b46..68dbe8ef 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -1,5 +1,7 @@ """The module to handle FAISS search.""" +from __future__ import annotations + from collections.abc import Sequence from termcolor import cprint from .search import Search diff --git 
a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index 908b2cc6..911e6bba 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -1,5 +1,7 @@ """The module to handle Linear search.""" +from __future__ import annotations + from collections.abc import Sequence from .search import Search from tensorflow_similarity.distances import Distance From f7f3576bba45bdbcf2d1ca15f1147369208eaca6 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Tue, 28 Feb 2023 09:32:03 -0800 Subject: [PATCH 05/35] add dependencies --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index dc5f63a0..a211c40a 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,8 @@ def get_version(rel_path): "dev": [ "flake8", "black", + "faiss", + "faiss-gpu", "pre-commit", "isort", "mkdocs", @@ -81,6 +83,7 @@ def get_version(rel_path): "mypy<=0.982", "pytest", "pytype", + "redis", "setuptools", "types-termcolor", "twine", From b6508b18c7fd5296493b103f511985ed09839535 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Tue, 28 Feb 2023 09:35:51 -0800 Subject: [PATCH 06/35] remove dependencies --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index a211c40a..e5f253f1 100644 --- a/setup.py +++ b/setup.py @@ -72,7 +72,6 @@ def get_version(rel_path): "dev": [ "flake8", "black", - "faiss", "faiss-gpu", "pre-commit", "isort", From 672378f5e09a9a8b4c2dfd9644c386a613aeaea4 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Tue, 28 Feb 2023 20:47:33 -0800 Subject: [PATCH 07/35] fixed typing issues --- setup.py | 1 + tensorflow_similarity/base_indexer.py | 17 +++++++++++++--- tensorflow_similarity/indexer.py | 8 -------- tensorflow_similarity/search/faiss_search.py | 1 - tensorflow_similarity/search/linear_search.py | 8 ++++---- tensorflow_similarity/stores/cached_store.py | 7 +++++-- tensorflow_similarity/stores/redis_store.py | 20 +++++++++++-------- 7 files changed, 
36 insertions(+), 26 deletions(-) diff --git a/setup.py b/setup.py index e5f253f1..480d7391 100644 --- a/setup.py +++ b/setup.py @@ -84,6 +84,7 @@ def get_version(rel_path): "pytype", "redis", "setuptools", + "types-redis", "types-termcolor", "twine", "types-tabulate", diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index b4970529..b9fe6eff 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -31,6 +31,16 @@ def __init__(self, distance, embedding_output, embedding_size, evaluator, stat_b # internal structure naming # FIXME support custom objects self.evaluator_type = evaluator + + self.evaluator: Optional[Evaluator] = None + + # code used to evaluate indexer performance + if self.evaluator_type == "memory": + self.evaluator: Evaluator = MemoryEvaluator() + elif isinstance(self.evaluator_type, Evaluator): + self.evaluator: Evaluator = self.evaluator_type + else: + raise ValueError("You need to either supply a know evaluator name " "or an Evaluator() object") # stats configuration self.stat_buffer_size = stat_buffer_size @@ -92,11 +102,12 @@ def evaluate_retrieval( lookups = self.batch_lookup(predictions, k=k, verbose=verbose) # Evaluate them - return self.evaluator.evaluate_retrieval( + eval_ret : dict[str, np.ndarray] = self.evaluator.evaluate_retrieval( retrieval_metrics=retrieval_metrics, target_labels=target_labels, lookups=lookups, ) + return eval_ret def evaluate_classification( self, @@ -154,7 +165,7 @@ def evaluate_classification( dtype=lookup_distances.dtype, ) - results = self.evaluator.evaluate_classification( + results : dict[str, np.ndarray] = self.evaluator.evaluate_classification( query_labels=query_labels, lookup_labels=lookup_labels, lookup_distances=lookup_distances, @@ -229,7 +240,7 @@ def calibrate( combined_metrics: list[ClassificationMetric] = [make_classification_metric(m) for m in extra_metrics] # running calibration - calibration_results = 
self.evaluator.calibrate( + calibration_results : CalibrationResults = self.evaluator.calibrate( target_labels=target_labels, lookups=lookups, thresholds_targets=thresholds_targets, diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index c6755b62..239cf15f 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -140,14 +140,6 @@ def _init_structures(self) -> None: else: raise ValueError("You need to either supply a know key value " "store name or a Store() object") - # code used to evaluate indexer performance - if self.evaluator_type == "memory": - self.evaluator: Evaluator = MemoryEvaluator() - elif isinstance(self.evaluator_type, Evaluator): - self.evaluator = self.evaluator_type - else: - raise ValueError("You need to either supply a know evaluator name " "or an Evaluator() object") - # stats self._stats: DefaultDict[str, int] = defaultdict(int) self._lookup_timings_buffer: Deque[float] = deque([], maxlen=self.stat_buffer_size) diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index 68dbe8ef..739f0fdd 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -60,7 +60,6 @@ def __init__( f"| - nlist: {self.nlist}", f"| - nprobe: {self.nprobe}", f"| - normalize: {self.normalize}", - f"| - query_params: {self.query_params}", ] cprint("\n".join(t_msg) + "\n", "green") diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index 911e6bba..e0b9067f 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -6,7 +6,7 @@ from .search import Search from tensorflow_similarity.distances import Distance from tensorflow_similarity.types import FloatTensor -from typing import Any +from typing import Any, List import numpy as np import tensorflow as tf import pickle @@ -52,7 +52,7 @@ def __init__( ] 
cprint("\n".join(t_msg) + "\n", "green") self.db = np.empty((INITIAL_DB_SIZE, dim), dtype=np.float32) - self.ids = [] + self.ids: List[int] = [] def is_built(self): return True @@ -73,7 +73,7 @@ def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[i sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) similarity, id_idxs = tf.math.top_k(sims, k) ids_array = np.array(self.ids) - return np.array([ids_array[x.numpy()] for x in id_idxs]), similarity + return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(similarity) def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[float]]: """Find embedding K nearest neighboors embeddings. @@ -87,7 +87,7 @@ def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[fl sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) similarity, id_idxs = tf.math.top_k(sims, k) ids_array = np.array(self.ids) - return np.array(ids_array[id_idxs[0].numpy()]), similarity[0] + return list(np.array(ids_array[id_idxs[0].numpy()])), list(similarity[0]) def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): """Add a single embedding to the search index. diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index d64fe386..9e86f40b 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -21,6 +21,7 @@ import dbm import json import math +import pandas as pd from tensorflow_similarity.types import FloatTensor, PandasDataFrame, Tensor @@ -215,7 +216,9 @@ def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: Defaults to 0 (unlimited). 
Returns: - None + Empty DataFrame """ - return None + # forcing type from Any to PandasFrame + df: PandasDataFrame = pd.DataFrame() + return df diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index 387234d8..644736b1 100644 --- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -16,6 +16,8 @@ from collections.abc import Sequence import json +import pandas as pd +import pickle import redis from .store import Store @@ -51,14 +53,14 @@ def add( Returns: Associated record id. """ - num_items = self.__conn.incr("num_items") + num_items = int(self.__conn.incr("num_items")) idx = num_items - 1 - self.__conn.set(idx, (embedding, label, data)) + self.__conn.set(idx, pickle.dumps((embedding, label, data))) return idx - def get_num_items(self): - return self.__conn.get("num_items") or 0 + def get_num_items(self) -> int: + return int(self.__conn.get("num_items")) or 0 def batch_add( self, @@ -100,7 +102,8 @@ def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: record associated with the requested id. """ - return self.__conn.get(str(idx)) + ret = pickle.loads(self.__conn.get(idx)) + return ret[0], ret[1], ret[2] def batch_get(self, idxs: Sequence[int]) -> tuple[list[FloatTensor], list[int | None], list[Tensor | None]]: """Get embedding records from the key value store. @@ -181,7 +184,8 @@ def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: Defaults to 0 (unlimited). 
Returns: - None + Empty DataFrame """ - - return None + # forcing type from Any to PandasFrame + df: PandasDataFrame = pd.DataFrame() + return df From 5ef37420da006bac32318bf5be65425e13b664ca Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Tue, 28 Feb 2023 21:53:18 -0800 Subject: [PATCH 08/35] remove extra typing --- .pre-commit-config.yaml | 2 +- tensorflow_similarity/base_indexer.py | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8dfcf5af..2003b47e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ repos: stages: ['commit'] - repo: https://github.com/pycqa/isort - rev: 5.10.1 + rev: 5.12.0 hooks: - id: isort name: isort (python) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index b9fe6eff..d50cc68c 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -1,24 +1,24 @@ from __future__ import annotations from abc import ABC, abstractmethod +from collections import defaultdict +from collections.abc import Mapping, MutableMapping, Sequence + import numpy as np import tensorflow as tf -from .types import CalibrationResults, FloatTensor, Lookup, Tensor -from collections.abc import Mapping, MutableMapping, Sequence -from .retrieval_metrics import RetrievalMetric -from .distances import distance_canonicalizer -from .matchers import ClassificationMatch, make_classification_matcher -from .utils import unpack_lookup_distances, unpack_lookup_labels -from collections import defaultdict +from tabulate import tabulate from tqdm.auto import tqdm - from .classification_metrics import ( ClassificationMetric, F1Score, make_classification_metric, ) -from tabulate import tabulate +from .distances import distance_canonicalizer +from .matchers import ClassificationMatch, make_classification_matcher +from .retrieval_metrics import RetrievalMetric +from .types import 
CalibrationResults, FloatTensor, Lookup, Tensor +from .utils import unpack_lookup_distances, unpack_lookup_labels class BaseIndexer(ABC): @@ -31,8 +31,6 @@ def __init__(self, distance, embedding_output, embedding_size, evaluator, stat_b # internal structure naming # FIXME support custom objects self.evaluator_type = evaluator - - self.evaluator: Optional[Evaluator] = None # code used to evaluate indexer performance if self.evaluator_type == "memory": @@ -102,7 +100,7 @@ def evaluate_retrieval( lookups = self.batch_lookup(predictions, k=k, verbose=verbose) # Evaluate them - eval_ret : dict[str, np.ndarray] = self.evaluator.evaluate_retrieval( + eval_ret: dict[str, np.ndarray] = self.evaluator.evaluate_retrieval( retrieval_metrics=retrieval_metrics, target_labels=target_labels, lookups=lookups, @@ -165,7 +163,7 @@ def evaluate_classification( dtype=lookup_distances.dtype, ) - results : dict[str, np.ndarray] = self.evaluator.evaluate_classification( + results: dict[str, np.ndarray] = self.evaluator.evaluate_classification( query_labels=query_labels, lookup_labels=lookup_labels, lookup_distances=lookup_distances, @@ -240,7 +238,7 @@ def calibrate( combined_metrics: list[ClassificationMetric] = [make_classification_metric(m) for m in extra_metrics] # running calibration - calibration_results : CalibrationResults = self.evaluator.calibrate( + calibration_results: CalibrationResults = self.evaluator.calibrate( target_labels=target_labels, lookups=lookups, thresholds_targets=thresholds_targets, From e81166740d6438c69e6c92c9ea590fb32e8944e4 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Tue, 28 Feb 2023 21:58:35 -0800 Subject: [PATCH 09/35] move evaluator --- tensorflow_similarity/base_indexer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index d50cc68c..bdaa0d46 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -15,6 +15,7 @@ 
make_classification_metric, ) from .distances import distance_canonicalizer +from .evaluators import Evaluator, MemoryEvaluator from .matchers import ClassificationMatch, make_classification_matcher from .retrieval_metrics import RetrievalMetric from .types import CalibrationResults, FloatTensor, Lookup, Tensor From 3507f2f8d13415e1ac2713fc39b626e98ac59e84 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 08:57:12 -0800 Subject: [PATCH 10/35] fix tests --- tensorflow_similarity/search/linear_search.py | 15 +++++++++------ tensorflow_similarity/stores/redis_store.py | 13 +++++++------ tests/search/test_linear_search.py | 3 ++- tests/stores/test_redis_store.py | 13 +++++++++---- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index e0b9067f..12bc862e 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -2,17 +2,20 @@ from __future__ import annotations +import json +import pickle from collections.abc import Sequence -from .search import Search -from tensorflow_similarity.distances import Distance -from tensorflow_similarity.types import FloatTensor from typing import Any, List + import numpy as np import tensorflow as tf -import pickle -import json from termcolor import cprint +from tensorflow_similarity.distances import Distance +from tensorflow_similarity.types import FloatTensor + +from .search import Search + INITIAL_DB_SIZE = 10000 DB_SIZE_STEPS = 10000 @@ -165,7 +168,7 @@ def __make_config_path(self, path): return path / "config.json" def __save_config(self, path): - with open(self.__make_config_file_path(path), "wt") as f: + with open(self.__make_config_path(path), "wt") as f: json.dump(self.get_config(), f) def get_config(self) -> dict[str, Any]: diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index 644736b1..ac81c884 100644 
--- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -13,17 +13,17 @@ # limitations under the License. from __future__ import annotations +import json +import pickle from collections.abc import Sequence -import json import pandas as pd -import pickle import redis -from .store import Store - from tensorflow_similarity.types import FloatTensor, PandasDataFrame, Tensor +from .store import Store + class RedisStore(Store): """Efficient Redis dataset store""" @@ -102,8 +102,9 @@ def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: record associated with the requested id. """ - ret = pickle.loads(self.__conn.get(idx)) - return ret[0], ret[1], ret[2] + ret_bytes: bytes = self.__conn.get(idx) + ret: tuple = pickle.loads(ret_bytes) + return (ret[0], ret[1], ret[2]) def batch_get(self, idxs: Sequence[int]) -> tuple[list[FloatTensor], list[int | None], list[Tensor | None]]: """Get embedding records from the key value store. diff --git a/tests/search/test_linear_search.py b/tests/search/test_linear_search.py index bad1bbe3..1f85121c 100644 --- a/tests/search/test_linear_search.py +++ b/tests/search/test_linear_search.py @@ -98,4 +98,5 @@ def test_running_larger_batches(): last_idx += index_size search_index.batch_add(embs, idxs) found_idxs, found_dists = search_index.batch_lookup(targets, 2) - assert found_idxs.shape == (10, 2) + assert len(found_idxs) == 10 + assert len(found_idxs[0]) == 2 diff --git a/tests/stores/test_redis_store.py b/tests/stores/test_redis_store.py index d739ebde..975293f6 100644 --- a/tests/stores/test_redis_store.py +++ b/tests/stores/test_redis_store.py @@ -1,7 +1,8 @@ -from unittest.mock import MagicMock -from unittest.mock import patch +import pickle +from unittest.mock import MagicMock, patch import numpy as np + from tensorflow_similarity.stores import RedisStore @@ -17,7 +18,8 @@ def build_store(records): @patch("redis.Redis", return_value=MagicMock()) def 
test_store_and_retrieve(mock_redis): records = [[[0.1, 0.2], 1, [0, 0, 0]], [[0.2, 0.3], 2, [0, 0, 0]]] - mock_redis.return_value.get.side_effect = records + serialized_records = [pickle.dumps(x) for x in records] + mock_redis.return_value.get.side_effect = serialized_records mock_redis.return_value.incr.side_effect = [1, 2, 3, 4, 5] kv_store, idxs = build_store(records) @@ -41,7 +43,10 @@ def test_batch_add(mock_redis): lbls = np.array([1, 2]) data = np.array([[0, 0, 0], [1, 1, 1]]) - mock_redis.return_value.get.side_effect = [[embs[i], lbls[i], data[i]] for i in range(2)] + records = [[embs[i], lbls[i], data[i]] for i in range(2)] + + serialized_records = [pickle.dumps(r) for r in records] + mock_redis.return_value.get.side_effect = serialized_records mock_redis.return_value.incr.side_effect = [1, 2, 3, 4, 5] kv_store = RedisStore() From 3c131d1a39da60253cbe68dd2918bd08cda2e4f5 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 11:52:05 -0800 Subject: [PATCH 11/35] switch from dbm to shelve --- tensorflow_similarity/stores/cached_store.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index 9e86f40b..26b6c063 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -13,14 +13,13 @@ # limitations under the License. 
from __future__ import annotations +import json +import math +import shelve +import shutil from collections.abc import Sequence from pathlib import Path -import pickle -import shutil -import dbm -import json -import math import pandas as pd from tensorflow_similarity.types import FloatTensor, PandasDataFrame, Tensor @@ -43,7 +42,7 @@ def __get_shard_file_path(self, shard_no): return f"{self.path}/cache{shard_no}" def __make_new_shard(self, shard_no: int): - return dbm.open(self.__get_shard_file_path(shard_no), "c") + return shelve.open(self.__get_shard_file_path(shard_no), "c") def __add_new_shard(self): shard_no = len(self.db) @@ -75,7 +74,7 @@ def add( shard_no = idx // self.shard_size if len(self.db) <= shard_no: self.__add_new_shard() - self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, data)) + self.db[shard_no][str(idx)] = (embedding, label, data) self.num_items += 1 return idx @@ -108,7 +107,7 @@ def batch_add( shard_no = idx // self.shard_size if len(self.db) <= shard_no: self.__add_new_shard() - self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, rec_data)) + self.db[shard_no][str(idx)] = (embedding, label, rec_data) idxs.append(idx) return idxs @@ -124,7 +123,7 @@ def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: """ shard_no = idx // self.shard_size - embedding, label, data = pickle.loads(self.db[shard_no][str(idx)]) + embedding, label, data = self.db[shard_no][str(idx)] return embedding, label, data def batch_get(self, idxs: Sequence[int]) -> tuple[list[FloatTensor], list[int | None], list[Tensor | None]]: From 92502acea1430755f957e818f8a8b10d15bad5fb Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 12:31:22 -0800 Subject: [PATCH 12/35] set temp dir for storing cached store --- tensorflow_similarity/stores/cached_store.py | 4 +-- tests/stores/test_cached_store.py | 27 ++++++++++++-------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/tensorflow_similarity/stores/cached_store.py 
b/tensorflow_similarity/stores/cached_store.py index 26b6c063..be00f53b 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -30,13 +30,13 @@ class CachedStore(Store): """Efficient cached dataset store""" - def __init__(self, shard_size=1000000) -> None: + def __init__(self, shard_size=1000000, path=".") -> None: # We are using a native python cached dictionary # db[id] = pickle((embedding, label, data)) self.db: list[dict[str, str]] = [] self.shard_size = shard_size self.num_items: int = 0 - self.path: str = "." + self.path: str = path def __get_shard_file_path(self, shard_no): return f"{self.path}/cache{shard_no}" diff --git a/tests/stores/test_cached_store.py b/tests/stores/test_cached_store.py index 036b8c1f..a5d67d17 100644 --- a/tests/stores/test_cached_store.py +++ b/tests/stores/test_cached_store.py @@ -1,10 +1,12 @@ +import os + import numpy as np from tensorflow_similarity.stores import CachedStore -def build_store(records): - kv_store = CachedStore() +def build_store(records, path): + kv_store = CachedStore(path=path) idxs = [] for r in records: idx = kv_store.add(r[0], r[1], r[2]) @@ -12,10 +14,10 @@ def build_store(records): return kv_store, idxs -def test_cached_store_and_retrieve(): +def test_cached_store_and_retrieve(tmp_path): records = [[[0.1, 0.2], 1, [0, 0, 0]], [[0.2, 0.3], 2, [0, 0, 0]]] - kv_store, idxs = build_store(records) + kv_store, idxs = build_store(records, tmp_path) # check index numbering for gt, idx in enumerate(idxs): @@ -33,12 +35,12 @@ def test_cached_store_and_retrieve(): assert dt == records[idx][2] -def test_batch_add(): +def test_batch_add(tmp_path): embs = np.array([[0.1, 0.2], [0.2, 0.3]]) lbls = np.array([1, 2]) data = np.array([[0, 0, 0], [1, 1, 1]]) - kv_store = CachedStore() + kv_store = CachedStore(path=tmp_path) idxs = kv_store.batch_add(embs, lbls, data) for idx in idxs: emb, lbl, dt = kv_store.get(idx) @@ -50,13 +52,18 @@ def test_batch_add(): def 
test_save_and_reload(tmp_path): records = [[[0.1, 0.2], 1, [0, 0, 0]], [[0.2, 0.3], 2, [0, 0, 0]]] - kv_store, idxs = build_store(records) - kv_store.save(tmp_path) + save_path = tmp_path / "save" + os.mkdir(save_path) + obj_path = tmp_path / "obj" + os.mkdir(obj_path) + + kv_store, idxs = build_store(records, obj_path) + kv_store.save(save_path) # reload reloaded_store = CachedStore() - print(f"loading from {tmp_path}") - reloaded_store.load(tmp_path) + print(f"loading from {save_path}") + reloaded_store.load(save_path) assert reloaded_store.size() == 2 From 71ebfdcf35f504717be60a26ee0872672e343b26 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 13:05:00 -0800 Subject: [PATCH 13/35] add debug logging --- tests/stores/test_cached_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/stores/test_cached_store.py b/tests/stores/test_cached_store.py index a5d67d17..80f2dd98 100644 --- a/tests/stores/test_cached_store.py +++ b/tests/stores/test_cached_store.py @@ -58,6 +58,7 @@ def test_save_and_reload(tmp_path): os.mkdir(obj_path) kv_store, idxs = build_store(records, obj_path) + logging.info(f"obj_path={os.listdir(obj_path)}\nsave_path={os.listdir(save_path)}") kv_store.save(save_path) # reload From 2f16f5a0fe614a98677846b559820596afcf15af Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 13:19:54 -0800 Subject: [PATCH 14/35] add debug logging --- tests/stores/test_cached_store.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/stores/test_cached_store.py b/tests/stores/test_cached_store.py index 80f2dd98..31f26c3a 100644 --- a/tests/stores/test_cached_store.py +++ b/tests/stores/test_cached_store.py @@ -1,3 +1,4 @@ +import logging import os import numpy as np From 1c9728d5143b4a28dac6fd0d1d7eae130b69a50d Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 14:07:07 -0800 Subject: [PATCH 15/35] specify dbm implementation for cross-machine compatibility --- tensorflow_similarity/stores/cached_store.py | 11 
++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index be00f53b..19296ad8 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -13,9 +13,10 @@ # limitations under the License. from __future__ import annotations +import dbm import json import math -import shelve +import pickle import shutil from collections.abc import Sequence from pathlib import Path @@ -42,7 +43,7 @@ def __get_shard_file_path(self, shard_no): return f"{self.path}/cache{shard_no}" def __make_new_shard(self, shard_no: int): - return shelve.open(self.__get_shard_file_path(shard_no), "c") + return dbm.ndbm.open(self.__get_shard_file_path(shard_no), "c") def __add_new_shard(self): shard_no = len(self.db) @@ -74,7 +75,7 @@ def add( shard_no = idx // self.shard_size if len(self.db) <= shard_no: self.__add_new_shard() - self.db[shard_no][str(idx)] = (embedding, label, data) + self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, data)) self.num_items += 1 return idx @@ -107,7 +108,7 @@ def batch_add( shard_no = idx // self.shard_size if len(self.db) <= shard_no: self.__add_new_shard() - self.db[shard_no][str(idx)] = (embedding, label, rec_data) + self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, rec_data)) idxs.append(idx) return idxs @@ -123,7 +124,7 @@ def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: """ shard_no = idx // self.shard_size - embedding, label, data = self.db[shard_no][str(idx)] + embedding, label, data = pickle.loads(self.db[shard_no][str(idx)]) return embedding, label, data def batch_get(self, idxs: Sequence[int]) -> tuple[list[FloatTensor], list[int | None], list[Tensor | None]]: From c97e41295083b4fe19420d58963174c7f4f38eee Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 14:20:54 -0800 Subject: [PATCH 16/35] switch to ndb.dumb as other options not available 
on all machines --- tensorflow_similarity/stores/cached_store.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index 19296ad8..406673d9 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -13,7 +13,7 @@ # limitations under the License. from __future__ import annotations -import dbm +import dbm.dumb import json import math import pickle @@ -43,7 +43,7 @@ def __get_shard_file_path(self, shard_no): return f"{self.path}/cache{shard_no}" def __make_new_shard(self, shard_no: int): - return dbm.ndbm.open(self.__get_shard_file_path(shard_no), "c") + return dbm.dumb.open(self.__get_shard_file_path(shard_no), "c") def __add_new_shard(self): shard_no = len(self.db) @@ -156,7 +156,9 @@ def __close_all_shards(self): def __copy_shards(self, path): for shard_no in range(len(self.db)): - shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix(".db"), path) + shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix(".bak"), path) + shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix(".dat"), path) + shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix(".dir"), path) def __make_config_file_path(self, path): return path / "config.json" From b520f457ca8d5ba6131d91a0aef7ce91ee7d04d4 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 14:45:53 -0800 Subject: [PATCH 17/35] fix import orders --- tensorflow_similarity/indexer.py | 18 ++++-------------- tensorflow_similarity/search/faiss_search.py | 11 +++++++---- tensorflow_similarity/stores/__init__.py | 4 ++-- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 239cf15f..f9193ae7 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -20,30 +20,20 @@ from collections import 
defaultdict, deque from pathlib import Path from time import time -from .base_indexer import BaseIndexer -from typing import ( - DefaultDict, - Deque, - List, - Optional, - Sequence, - Union, -) +from typing import DefaultDict, Deque, List, Optional, Sequence, Union import numpy as np import tensorflow as tf from tabulate import tabulate from tqdm.auto import tqdm -from .classification_metrics import ( - F1Score, - make_classification_metric, -) +from .base_indexer import BaseIndexer +from .classification_metrics import F1Score, make_classification_metric # internal from .distances import Distance from .evaluators import Evaluator, MemoryEvaluator -from .search import NMSLibSearch, Search, make_search, LinearSearch +from .search import LinearSearch, NMSLibSearch, Search, make_search from .stores import MemoryStore, Store from .types import FloatTensor, Lookup, PandasDataFrame, Tensor diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index 739f0fdd..24e42307 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -3,14 +3,17 @@ from __future__ import annotations from collections.abc import Sequence -from termcolor import cprint -from .search import Search +from pathlib import Path +from typing import Any + import faiss import numpy as np +from termcolor import cprint + from tensorflow_similarity.distances import Distance from tensorflow_similarity.types import FloatTensor -from pathlib import Path -from typing import Any + +from .search import Search class FaissSearch(Search): diff --git a/tensorflow_similarity/stores/__init__.py b/tensorflow_similarity/stores/__init__.py index a7c71e31..9a1950cb 100644 --- a/tensorflow_similarity/stores/__init__.py +++ b/tensorflow_similarity/stores/__init__.py @@ -27,7 +27,7 @@ via the `to_pandas()` method. 
""" -from .memory_store import MemoryStore # noqa -from .store import Store # noqa from .cached_store import CachedStore # noqa +from .memory_store import MemoryStore # noqa from .redis_store import RedisStore # noqa +from .store import Store # noqa From 185d5b91e32e51d7badba26f7fd891d31cc1cb3b Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 1 Mar 2023 19:53:12 -0800 Subject: [PATCH 18/35] remove extraneous logging --- tests/stores/test_cached_store.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/stores/test_cached_store.py b/tests/stores/test_cached_store.py index 31f26c3a..a5d67d17 100644 --- a/tests/stores/test_cached_store.py +++ b/tests/stores/test_cached_store.py @@ -1,4 +1,3 @@ -import logging import os import numpy as np @@ -59,7 +58,6 @@ def test_save_and_reload(tmp_path): os.mkdir(obj_path) kv_store, idxs = build_store(records, obj_path) - logging.info(f"obj_path={os.listdir(obj_path)}\nsave_path={os.listdir(save_path)}") kv_store.save(save_path) # reload From 0e28daa294c4c787f78bcb2167566137199f2c16 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 13:13:47 -0800 Subject: [PATCH 19/35] ensure only names are stored in metadata --- tensorflow_similarity/indexer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index f9193ae7..325ecb6e 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -101,8 +101,8 @@ def __init__( super().__init__(distance, embedding_output, embedding_size, evaluator, stat_buffer_size) # internal structure naming # FIXME support custom objects - self.search_type = search - self.kv_store_type = kv_store + self.search_type = search if isinstance(search, str) else type(search).__name__ + self.kv_store_type = kv_store if isinstance(kv_store, str) else type(kv_store).__name__ # initialize internal structures self._init_structures() From 807a41a4280565782b2fdea1870c740bd8dc7126 Mon Sep 17 
00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 13:25:37 -0800 Subject: [PATCH 20/35] separate store from store_type, and search from search_type, needed for serialization of metadata --- tensorflow_similarity/indexer.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 325ecb6e..db696d06 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -102,7 +102,11 @@ def __init__( # internal structure naming # FIXME support custom objects self.search_type = search if isinstance(search, str) else type(search).__name__ + if isinstance(search, Search): + self.search = search self.kv_store_type = kv_store if isinstance(kv_store, str) else type(kv_store).__name__ + if isinstance(kv_store, Store): + self.kv_store = kv_store # initialize internal structures self._init_structures() @@ -117,9 +121,8 @@ def _init_structures(self) -> None: self.search: Search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) elif self.search_type == "linear": self.search = LinearSearch(distance=self.distance, dim=self.embedding_size) - elif isinstance(self.search_type, Search): - self.search = self.search_type - else: + elif not isinstance(self.search, Search): + # self.search should have been already initialized raise ValueError("You need to either supply a known search " "framework name or a Search() object") # mapper from id to record data @@ -127,7 +130,8 @@ def _init_structures(self) -> None: self.kv_store: Store = MemoryStore() elif isinstance(self.kv_store_type, Store): self.kv_store = self.kv_store_type - else: + elif not isinstance(self.kv_store, Store): + # self.kv_store should have been already initialized raise ValueError("You need to either supply a know key value " "store name or a Store() object") # stats From 2b5169816fd33a5c35e871e83fbf1e90ee1b53d5 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 13:37:56 -0800 Subject: 
[PATCH 21/35] use str path --- tensorflow_similarity/search/linear_search.py | 5 +++-- tensorflow_similarity/stores/cached_store.py | 2 +- tensorflow_similarity/stores/redis_store.py | 3 ++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index 12bc862e..8a3d5131 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -5,6 +5,7 @@ import json import pickle from collections.abc import Sequence +from pathlib import Path from typing import Any, List import numpy as np @@ -141,7 +142,7 @@ def batch_add( self.db[items : items + len(embeddings)] = int_embeddings def __make_file_path(self, path): - return path / "index.pickle" + return Path(path) / "index.pickle" def save(self, path: str): """Serializes the index data on disk @@ -165,7 +166,7 @@ def load(self, path: str): self.ids = data[1] def __make_config_path(self, path): - return path / "config.json" + return Path(path) / "config.json" def __save_config(self, path): with open(self.__make_config_path(path), "wt") as f: diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index 406673d9..a090f9a3 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -161,7 +161,7 @@ def __copy_shards(self, path): shutil.copy(Path(self.__get_shard_file_path(shard_no)).with_suffix(".dir"), path) def __make_config_file_path(self, path): - return path / "config.json" + return Path(path) / "config.json" def __save_config(self, path): with open(self.__make_config_file_path(path), "wt") as f: diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index ac81c884..dfff4e7d 100644 --- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -16,6 +16,7 @@ import json import pickle from 
collections.abc import Sequence +from pathlib import Path import pandas as pd import redis @@ -130,7 +131,7 @@ def size(self) -> int: return self.get_num_items() def __make_config_file_path(self, path): - return path / "config.json" + return Path(path) / "config.json" def __save_config(self, path): with open(self.__make_config_file_path(path), "wt") as f: From 45e3d477ca84c794b76451dfe0894e7161b578da Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 14:12:19 -0800 Subject: [PATCH 22/35] put typing in one place --- tensorflow_similarity/indexer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index db696d06..9cf2f21f 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -100,6 +100,8 @@ def __init__( """ super().__init__(distance, embedding_output, embedding_size, evaluator, stat_buffer_size) # internal structure naming + self.search: Optional[Search] = None + self.kv_store: Optional[Store] = None # FIXME support custom objects self.search_type = search if isinstance(search, str) else type(search).__name__ if isinstance(search, Search): @@ -118,7 +120,7 @@ def _init_structures(self) -> None: "(re)initialize internal storage structure" if self.search_type == "nmslib": - self.search: Search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) + self.search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) elif self.search_type == "linear": self.search = LinearSearch(distance=self.distance, dim=self.embedding_size) elif not isinstance(self.search, Search): @@ -127,13 +129,18 @@ def _init_structures(self) -> None: # mapper from id to record data if self.kv_store_type == "memory": - self.kv_store: Store = MemoryStore() + self.kv_store = MemoryStore() elif isinstance(self.kv_store_type, Store): self.kv_store = self.kv_store_type elif not isinstance(self.kv_store, Store): # self.kv_store should have been 
already initialized raise ValueError("You need to either supply a know key value " "store name or a Store() object") + if not self.search: + raise ValueError("search not initialized") + if not self.kv_store: + raise ValueError("kv_store not initialized") + # stats self._stats: DefaultDict[str, int] = defaultdict(int) self._lookup_timings_buffer: Deque[float] = deque([], maxlen=self.stat_buffer_size) From 80e9fee00eadf6601d80af3b01dc526e1b0f110a Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 15:24:35 -0800 Subject: [PATCH 23/35] add canonical name for consistent reload --- tensorflow_similarity/indexer.py | 4 +- tensorflow_similarity/search/faiss_search.py | 4 +- tensorflow_similarity/search/linear_search.py | 3 +- tensorflow_similarity/search/utils.py | 4 ++ tensorflow_similarity/stores/__init__.py | 1 + tensorflow_similarity/stores/cached_store.py | 6 ++- tensorflow_similarity/stores/memory_store.py | 3 ++ tensorflow_similarity/stores/redis_store.py | 6 ++- tensorflow_similarity/stores/store.py | 12 +++++ tensorflow_similarity/stores/utils.py | 50 +++++++++++++++++++ 10 files changed, 85 insertions(+), 8 deletions(-) create mode 100644 tensorflow_similarity/stores/utils.py diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 9cf2f21f..348ef8d3 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -34,7 +34,7 @@ from .distances import Distance from .evaluators import Evaluator, MemoryEvaluator from .search import LinearSearch, NMSLibSearch, Search, make_search -from .stores import MemoryStore, Store +from .stores import MemoryStore, Store, make_store from .types import FloatTensor, Lookup, PandasDataFrame, Tensor @@ -381,6 +381,7 @@ def save(self, path: str, compression: bool = True): "embedding_output": self.embedding_output, "embedding_size": self.embedding_size, "kv_store": self.kv_store_type, + "kv_store_config": self.kv_store.get_config(), "evaluator": self.evaluator_type, 
"search_config": self.search.get_config(), "stat_buffer_size": self.stat_buffer_size, @@ -416,6 +417,7 @@ def load(path: str | Path, verbose: int = 1): metadata = tf.keras.backend.eval(metadata) md = json.loads(metadata) search = make_search(md["search_config"]) + kv_store = make_store(md["kv_store_config"]) index = Indexer( distance=md["distance"], embedding_size=md["embedding_size"], diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index 24e42307..eaa37a32 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -221,5 +221,5 @@ def get_config(self) -> dict[str, Any]: "name": self.name, "canonical_name": self.__class__.__name__, } - - return config + base_config = super().get_config() + return {**base_config, **config} diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index 8a3d5131..de61addb 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -183,4 +183,5 @@ def get_config(self) -> dict[str, Any]: "dim": self.dim, } - return config + base_config = super().get_config() + return {**base_config, **config} diff --git a/tensorflow_similarity/search/utils.py b/tensorflow_similarity/search/utils.py index aded6a35..50d561e1 100644 --- a/tensorflow_similarity/search/utils.py +++ b/tensorflow_similarity/search/utils.py @@ -15,11 +15,15 @@ from typing import Any, Type +from .faiss_search import FaissSearch +from .linear_search import LinearSearch from .nmslib_search import NMSLibSearch from .search import Search SEARCH_ALIASES: dict[str, Type[Search]] = { "NMSLibSearch": NMSLibSearch, + "LinearSearch": LinearSearch, + "FaissSearch": FaissSearch, } diff --git a/tensorflow_similarity/stores/__init__.py b/tensorflow_similarity/stores/__init__.py index 9a1950cb..edb571ab 100644 --- a/tensorflow_similarity/stores/__init__.py +++ 
b/tensorflow_similarity/stores/__init__.py @@ -31,3 +31,4 @@ from .memory_store import MemoryStore # noqa from .redis_store import RedisStore # noqa from .store import Store # noqa +from .utils import make_store # noqa diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index a090f9a3..5be2004b 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -167,7 +167,7 @@ def __save_config(self, path): with open(self.__make_config_file_path(path), "wt") as f: json.dump(self.get_config(), f) - def __set_config(self, num_items, shard_size): + def __set_config(self, num_items, shard_size, **kw_args): self.num_items = num_items self.shard_size = shard_size @@ -190,7 +190,9 @@ def save(self, path: str, compression: bool = True) -> None: self.__reopen_all_shards() def get_config(self): - return {"shard_size": self.shard_size, "num_items": self.num_items} + config = {"shard_size": self.shard_size, "num_items": self.num_items} + base_config = super().get_config() + return {**base_config, **config} def load(self, path: str) -> int: """load index on disk diff --git a/tensorflow_similarity/stores/memory_store.py b/tensorflow_similarity/stores/memory_store.py index 6d2de8e8..b3372f62 100644 --- a/tensorflow_similarity/stores/memory_store.py +++ b/tensorflow_similarity/stores/memory_store.py @@ -207,3 +207,6 @@ def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: # forcing type from Any to PandasFrame df: PandasDataFrame = pd.DataFrame.from_dict(data) return df + + def get_config(self): + return super().get_config() diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index dfff4e7d..2b32988e 100644 --- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -137,7 +137,7 @@ def __save_config(self, path): with open(self.__make_config_file_path(path), "wt") as f: 
json.dump(self.get_config(), f) - def __set_config(self, host, port, db): + def __set_config(self, host, port, db, **kw_args): self.host = host self.port = port self.db = db @@ -162,7 +162,9 @@ def save(self, path: str, compression: bool = True) -> None: self.__save_config(path) def get_config(self): - return {"host": self.host, "port": self.port, "db": self.db, "num_items": self.get_num_items()} + config = {"host": self.host, "port": self.port, "db": self.db, "num_items": self.get_num_items()} + base_config = super().get_config() + return {**base_config, **config} def load(self, path: str) -> int: """load index on disk diff --git a/tensorflow_similarity/stores/store.py b/tensorflow_similarity/stores/store.py index 7855b234..b3a29fb4 100644 --- a/tensorflow_similarity/stores/store.py +++ b/tensorflow_similarity/stores/store.py @@ -115,3 +115,15 @@ def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: Returns: pd.DataFrame: a pandas dataframe. """ + + def get_config(self) -> dict[str, Any]: + """Contains the Store configuration. + + Returns: + A Python dict containing the configuration of the Store obj. + """ + config = { + "canonical_name": self.__class__.__name__, + } + + return config diff --git a/tensorflow_similarity/stores/utils.py b/tensorflow_similarity/stores/utils.py new file mode 100644 index 00000000..ff1813b4 --- /dev/null +++ b/tensorflow_similarity/stores/utils.py @@ -0,0 +1,50 @@ +# Copyright 2021 The TensorFlow Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from typing import Any, Type + +from .cached_store import CachedStore +from .memory_store import MemoryStore +from .redis_store import RedisStore +from .store import Store + +STORE_ALIASES: dict[str, Type[Store]] = { + "RedisStore": RedisStore, + "CachedStore": CachedStore, + "MemoryStore": MemoryStore, +} + + +def make_store(config: dict[str, Any]) -> Store: + """Creates a store instance from its config. + + This method is the reverse of `get_config`, + capable of instantiating the same store from the config + + Args: + config: A Python dictionary, typically the output of get_config. + + Returns: + A Store instance. + """ + + if config["canonical_name"] in STORE_ALIASES: + config_copy = dict(config) + del config_copy["canonical_name"] + store: Store = STORE_ALIASES[config["canonical_name"]](**config_copy) + else: + raise ValueError(f"Unknown search type: {config['canonical_name']}") + + return store From d32406f2f07ed4cdf247a9394473ce02eb464073 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 16:14:39 -0800 Subject: [PATCH 24/35] accept canonical_name --- tensorflow_similarity/search/faiss_search.py | 1 + tensorflow_similarity/search/linear_search.py | 8 +------- tensorflow_similarity/stores/cached_store.py | 2 +- tensorflow_similarity/stores/memory_store.py | 2 +- tensorflow_similarity/stores/redis_store.py | 2 +- tensorflow_similarity/stores/store.py | 1 + 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index eaa37a32..11f1584e 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -34,6 +34,7 @@ def __init__( nlist=1024, nprobe=1, normalize=True, + **kw_args, ): """Initiate FAISS indexer diff --git a/tensorflow_similarity/search/linear_search.py
b/tensorflow_similarity/search/linear_search.py index de61addb..e0403b22 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -27,13 +27,7 @@ class LinearSearch(Search): It implements the Search interface. """ - def __init__( - self, - distance: Distance | str, - dim: int, - verbose: int = 0, - name: str | None = None, - ): + def __init__(self, distance: Distance | str, dim: int, verbose: int = 0, name: str | None = None, **kw_args): """Initiate Linear indexer. Args: diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index 5be2004b..a467a086 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -31,7 +31,7 @@ class CachedStore(Store): """Efficient cached dataset store""" - def __init__(self, shard_size=1000000, path=".") -> None: + def __init__(self, shard_size=1000000, path=".", **kw_args) -> None: # We are using a native python cached dictionary # db[id] = pickle((embedding, label, data)) self.db: list[dict[str, str]] = [] diff --git a/tensorflow_similarity/stores/memory_store.py b/tensorflow_similarity/stores/memory_store.py index b3372f62..6792cf4b 100644 --- a/tensorflow_similarity/stores/memory_store.py +++ b/tensorflow_similarity/stores/memory_store.py @@ -29,7 +29,7 @@ class MemoryStore(Store): """Efficient in-memory dataset store""" - def __init__(self) -> None: + def __init__(self, **kw_args) -> None: # We are using a native python array in memory for its row speed. # Serialization / export relies on Arrow. 
self.labels: list[int | None] = [] diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index 2b32988e..2cad7610 100644 --- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -29,7 +29,7 @@ class RedisStore(Store): """Efficient Redis dataset store""" - def __init__(self, host="localhost", port=6379, db=0) -> None: + def __init__(self, host="localhost", port=6379, db=0, **kw_args) -> None: # Currently does not support authentication self.host = host self.port = port diff --git a/tensorflow_similarity/stores/store.py b/tensorflow_similarity/stores/store.py index b3a29fb4..37d1dd48 100644 --- a/tensorflow_similarity/stores/store.py +++ b/tensorflow_similarity/stores/store.py @@ -15,6 +15,7 @@ from abc import ABC, abstractmethod from collections.abc import Sequence +from typing import Any from tensorflow_similarity.types import FloatTensor, PandasDataFrame, Tensor From 573ec4f7b3bf37e8a9d5c02689f799407edcafdc Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 16:20:28 -0800 Subject: [PATCH 25/35] Remove optional --- tensorflow_similarity/indexer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 348ef8d3..372155a4 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -100,15 +100,13 @@ def __init__( """ super().__init__(distance, embedding_output, embedding_size, evaluator, stat_buffer_size) # internal structure naming - self.search: Optional[Search] = None - self.kv_store: Optional[Store] = None # FIXME support custom objects self.search_type = search if isinstance(search, str) else type(search).__name__ if isinstance(search, Search): - self.search = search + self.search: Search = search self.kv_store_type = kv_store if isinstance(kv_store, str) else type(kv_store).__name__ if isinstance(kv_store, Store): - self.kv_store = kv_store 
+ self.kv_store: Store = kv_store # initialize internal structures self._init_structures() From 904f06ea8a60ad3287b66ef426fcd2e90c61abf3 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 22:54:52 -0800 Subject: [PATCH 26/35] adding more tests --- tensorflow_similarity/indexer.py | 16 +++++--- tensorflow_similarity/search/faiss_search.py | 2 +- tensorflow_similarity/stores/cached_store.py | 8 ++-- tests/test_indexer.py | 41 ++++++++++++++++++++ 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 372155a4..740fc2b4 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -17,6 +17,7 @@ from __future__ import annotations import json +import os from collections import defaultdict, deque from pathlib import Path from time import time @@ -194,6 +195,9 @@ def _cast_label(self, label: Optional[int]) -> Optional[int]: label = int(label) return label + def build_index(self, samples, **kwargss): + self.search.build_index(samples) + def add( self, prediction: FloatTensor, @@ -393,8 +397,10 @@ def save(self, path: str, compression: bool = True): metadata_fname = self.__make_metadata_fname(path) tf.io.write_file(metadata_fname, json.dumps(metadata)) - self.kv_store.save(path, compression=compression) - self.search.save(path) + os.mkdir(Path(path) / "store") + os.mkdir(Path(path) / "search") + self.kv_store.save(Path(path) / "store", compression=compression) + self.search.save(Path(path) / "search") @staticmethod def load(path: str | Path, verbose: int = 1): @@ -420,7 +426,7 @@ def load(path: str | Path, verbose: int = 1): distance=md["distance"], embedding_size=md["embedding_size"], embedding_output=md["embedding_output"], - kv_store=md["kv_store"], + kv_store=kv_store, evaluator=md["evaluator"], search=search, stat_buffer_size=md["stat_buffer_size"], @@ -429,12 +435,12 @@ def load(path: str | Path, verbose: int = 1): # reload the key value store if 
verbose: print("Loading index data") - index.kv_store.load(path) + index.kv_store.load(Path(path) / "store") # rebuild the index if verbose: print("Loading search index") - index.search.load(path) + index.search.load(Path(path) / "search") # reload calibration data if any index.is_calibrated = md["is_calibrated"] diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index 11f1584e..f1241dd8 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -180,7 +180,7 @@ def batch_add( if self.algo != "flat": # flat does not accept indexes as parameters and assumes incremental # indexes - self.index.add_with_ids(embeddings, idxs) + self.index.add_with_ids(embeddings, np.array(idxs)) else: self.index.add(embeddings) diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index a467a086..2afcdf80 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -31,12 +31,12 @@ class CachedStore(Store): """Efficient cached dataset store""" - def __init__(self, shard_size=1000000, path=".", **kw_args) -> None: + def __init__(self, shard_size=1000000, path=".", num_items=0, **kw_args) -> None: # We are using a native python cached dictionary # db[id] = pickle((embedding, label, data)) self.db: list[dict[str, str]] = [] self.shard_size = shard_size - self.num_items: int = 0 + self.num_items: int = num_items self.path: str = path def __get_shard_file_path(self, shard_no): @@ -110,6 +110,7 @@ def batch_add( self.__add_new_shard() self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, rec_data)) idxs.append(idx) + self.num_items += len(embeddings) return idxs @@ -173,7 +174,8 @@ def __set_config(self, num_items, shard_size, **kw_args): def __load_config(self, path): with open(self.__make_config_file_path(path), "rt") as f: - self.__set_config(**json.load(f)) + config = 
json.load(f) + self.__set_config(**config) def save(self, path: str, compression: bool = True) -> None: """Serializes index on disk. diff --git a/tests/test_indexer.py b/tests/test_indexer.py index 2ca33d80..a89dd12d 100644 --- a/tests/test_indexer.py +++ b/tests/test_indexer.py @@ -1,6 +1,8 @@ import numpy as np from tensorflow_similarity.indexer import Indexer +from tensorflow_similarity.search import FaissSearch, LinearSearch +from tensorflow_similarity.stores import CachedStore from . import DATA_DIR @@ -129,6 +131,45 @@ def test_uncompress_reload(tmp_path): assert indexer2.size() == 2 +def test_linear_search_reload(tmp_path): + "Ensure the save and load of custom search and store work" + embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") + search = LinearSearch("cosine", 3) + store = CachedStore() + + indexer = Indexer(3, search=search, kv_store=store) + indexer.batch_add(embs, verbose=0) + assert indexer.size() == 2 + + # save + path = tmp_path / "test_save_and_add/" + indexer.save(path, compression=False) + + # reload + indexer2 = Indexer.load(path) + assert indexer2.size() == 2 + + +def test_faiss_search_reload(tmp_path): + "Ensure the save and load of Faiss search and store work" + embs = np.random.random((1024, 8)).astype(np.float32) + search = FaissSearch("cosine", 8, m=4, nlist=2) + store = CachedStore() + + indexer = Indexer(8, search=search, kv_store=store) + indexer.build_index(embs) + indexer.batch_add(embs, verbose=0) + assert indexer.size() == 1024 + + # save + path = tmp_path / "test_save_and_add/" + indexer.save(path, compression=False) + + # reload + indexer2 = Indexer.load(path) + assert indexer2.size() == 1024 + + def test_index_reset(): prediction = np.array([[1, 1, 2]], dtype="float32") From 077d920c89c9be10a97615c634c5118853415c0c Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 6 Mar 2023 23:34:04 -0800 Subject: [PATCH 27/35] pass str for path --- tensorflow_similarity/indexer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 740fc2b4..569c5e26 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -399,8 +399,8 @@ def save(self, path: str, compression: bool = True): os.mkdir(Path(path) / "store") os.mkdir(Path(path) / "search") - self.kv_store.save(Path(path) / "store", compression=compression) - self.search.save(Path(path) / "search") + self.kv_store.save(str(Path(path) / "store"), compression=compression) + self.search.save(str(Path(path) / "search")) @staticmethod def load(path: str | Path, verbose: int = 1): @@ -435,12 +435,12 @@ def load(path: str | Path, verbose: int = 1): # reload the key value store if verbose: print("Loading index data") - index.kv_store.load(Path(path) / "store") + index.kv_store.load(str(Path(path) / "store")) # rebuild the index if verbose: print("Loading search index") - index.search.load(Path(path) / "search") + index.search.load(str(Path(path) / "search")) # reload calibration data if any index.is_calibrated = md["is_calibrated"] From 691df9cbf49451899b789199b96957e4e74fbc25 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Wed, 8 Mar 2023 22:45:31 -0800 Subject: [PATCH 28/35] support more distances for LinearSearch --- tensorflow_similarity/search/linear_search.py | 45 ++++++++++++++----- tests/search/test_linear_search.py | 28 ++++++++++++ 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index e0403b22..75e9323d 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -66,12 +66,38 @@ def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[i k: Number of nearest neighboors embedding to lookup. Defaults to 5. 
""" - normalized_query = tf.math.l2_normalize(embeddings, axis=1) items = len(self.ids) - sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) - similarity, id_idxs = tf.math.top_k(sims, k) - ids_array = np.array(self.ids) - return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(similarity) + if self.distance.name == "cosine": + normalized_query = tf.math.l2_normalize(embeddings, axis=1) + sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) + similarity, id_idxs = tf.math.top_k(sims, k) + ids_array = np.array(self.ids) + return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(similarity) + elif self.distance.name in ("euclidean", "squared_euclidean"): + normalized_query = tf.math.l2_normalize(embeddings, axis=1) + items = len(self.ids) + assert ( + normalized_query.shape.as_list()[-1] == self.db.shape[-1] + ), "the last dimension should have the same size" + query_norms = tf.reduce_sum(tf.square(normalized_query), axis=1) + query_norms = tf.reshape(query_norms, [-1, 1]) # Only one column per row + + db_norms = tf.reduce_sum(tf.square(self.db[:items]), axis=1) + db_norms = tf.reshape(db_norms, [-1, 1]) # Only one column per row + + dists = query_norms - 2 * tf.matmul(normalized_query, tf.transpose(self.db[:items])) + db_norms + dists, id_idxs = tf.math.top_k(-dists, k) + dists = -dists + ids_array = np.array(self.ids) + return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(dists) + elif self.distance.name == "manhattan": + dists = tf.reduce_sum(tf.abs(tf.subtract(self.db[:items], tf.expand_dims(embeddings, 1))), axis=2) + dists, id_idxs = tf.math.top_k(-dists, k) + dists = -dists + ids_array = np.array(self.ids) + return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(dists) + else: + raise ValueError("Unsupported metric space") def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[float]]: """Find embedding K nearest neighboors embeddings. 
@@ -80,12 +106,9 @@ def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[fl embedding: Query embedding as predicted by the model. k: Number of nearest neighboors embedding to lookup. Defaults to 5. """ - normalized_query = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) - items = len(self.ids) - sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) - similarity, id_idxs = tf.math.top_k(sims, k) - ids_array = np.array(self.ids) - return list(np.array(ids_array[id_idxs[0].numpy()])), list(similarity[0]) + embeddings: FloatTensor = tf.convert_to_tensor([embedding], dtype=np.float32) + idxs, dists = self.batch_lookup(embeddings, k=k) + return idxs[0], dists[0] def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): """Add a single embedding to the search index. diff --git a/tests/search/test_linear_search.py b/tests/search/test_linear_search.py index 1f85121c..0a86a0b1 100644 --- a/tests/search/test_linear_search.py +++ b/tests/search/test_linear_search.py @@ -17,6 +17,34 @@ def test_index_match(): assert list(idxs) == [0, 1] +def test_index_match_l1(): + target = np.array([1, 1, 2], dtype="float32") + embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") + + search_index = LinearSearch("l1", 3) + search_index.add(embs[0], 0) + search_index.add(embs[1], 1) + + idxs, embs = search_index.lookup(target, k=2) + + assert len(embs) == 2 + assert list(idxs) == [1, 0] + + +def test_index_match_l2(): + target = np.array([1, 1, 2], dtype="float32") + embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") + + search_index = LinearSearch("l2", 3) + search_index.add(embs[0], 0) + search_index.add(embs[1], 1) + + idxs, embs = search_index.lookup(target, k=2) + + assert len(embs) == 2 + assert list(idxs) == [0, 1] + + def test_index_save(tmp_path): target = np.array([1, 1, 2], dtype="float32") embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") From 09aa1cc0d4161bf1f67e279d3cb8104a698dddc1 Mon 
Sep 17 00:00:00 2001 From: Ali Zand Date: Thu, 9 Mar 2023 11:05:37 -0800 Subject: [PATCH 29/35] add indexing colab --- examples/indexing_colab.ipynb | 2746 +++++++++++++++++++++++++++++++++ 1 file changed, 2746 insertions(+) create mode 100644 examples/indexing_colab.ipynb diff --git a/examples/indexing_colab.ipynb b/examples/indexing_colab.ipynb new file mode 100644 index 00000000..6f6d3e06 --- /dev/null +++ b/examples/indexing_colab.ipynb @@ -0,0 +1,2746 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "collapsed_sections": [ + "ePmNIj8hSVAn" + ] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "9dffbdfbc552434ebcc3f480daee4bd9": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_15445b1000d94eea943c0f2db61f3de1", + "IPY_MODEL_b81c53fd06c24652affa33c7e5b95af3", + "IPY_MODEL_36894b6f420e41b196c94b5bbedc2552" + ], + "layout": "IPY_MODEL_a22fcd57348e4b9b9b537c461b7240d2" + } + }, + "15445b1000d94eea943c0f2db61f3de1": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_e501f796a3a649ef9f2fccb9017279b3", + "placeholder": "​", + "style": "IPY_MODEL_b900825d8731446a8dae9299ecf5c1a3", + "value": "filtering examples: 100%" + } + }, + "b81c53fd06c24652affa33c7e5b95af3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_003a9d6fd5a34026969972b568460f4b", + "max": 60000, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_d87efba0bf1f419d8f916a04d50b2057", + "value": 60000 + } + }, + "36894b6f420e41b196c94b5bbedc2552": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3d4ba235da194b728ba6350e86d3b2d1", + "placeholder": "​", + "style": "IPY_MODEL_ee45e68a2a7f43c7b6eadddb5634eed5", + "value": " 60000/60000 [00:00<00:00, 823941.96it/s]" + } + }, + "a22fcd57348e4b9b9b537c461b7240d2": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e501f796a3a649ef9f2fccb9017279b3": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": 
null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b900825d8731446a8dae9299ecf5c1a3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "003a9d6fd5a34026969972b568460f4b": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d87efba0bf1f419d8f916a04d50b2057": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": 
"@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "3d4ba235da194b728ba6350e86d3b2d1": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ee45e68a2a7f43c7b6eadddb5634eed5": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + 
}, + "7437216a87894cb1b15f3a1e190c8684": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fc4b40f05f1b44f3b8fb924f7e56390d", + "IPY_MODEL_3752a74a573947e797533665e85750f0", + "IPY_MODEL_3a6c2ea5aea84cc29808825a0cde0f1b" + ], + "layout": "IPY_MODEL_6968ab9dba0d492f8a53db348595af10" + } + }, + "fc4b40f05f1b44f3b8fb924f7e56390d": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b3de8ed0b9ba4787ab8a65ad34a8b396", + "placeholder": "​", + "style": "IPY_MODEL_d7560023f385471989ebb30475f76e02", + "value": "selecting classes: 100%" + } + }, + "3752a74a573947e797533665e85750f0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_afafac0b0078453fb3e72264bf54ad40", + "max": 6, + "min": 0, + "orientation": "horizontal", 
+ "style": "IPY_MODEL_02ec015a1aa24dffab063edfeb453998", + "value": 6 + } + }, + "3a6c2ea5aea84cc29808825a0cde0f1b": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_69ee26e15b064e90be44c0fcfc89e778", + "placeholder": "​", + "style": "IPY_MODEL_89c3ea106b5442cb96d77c658cfb35be", + "value": " 6/6 [00:00<00:00, 298.71it/s]" + } + }, + "6968ab9dba0d492f8a53db348595af10": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"b3de8ed0b9ba4787ab8a65ad34a8b396": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d7560023f385471989ebb30475f76e02": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "afafac0b0078453fb3e72264bf54ad40": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": 
null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "02ec015a1aa24dffab063edfeb453998": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "69ee26e15b064e90be44c0fcfc89e778": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": 
null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "89c3ea106b5442cb96d77c658cfb35be": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5838c303535a4d119bb20c72c2a8d4b0": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ae0a4b489c30469da7554a4703c2ba2c", + "IPY_MODEL_6372c74bb16e4bc18a4fb35dcfb58e69", + "IPY_MODEL_8b3fd08c655a44d9a7f37fd73f756370" + ], + "layout": "IPY_MODEL_a43064e7c0234afdbf6ed7cb7b67b426" + } + }, + "ae0a4b489c30469da7554a4703c2ba2c": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_38802fe54df5428ba519218fc8e43d33", + "placeholder": "​", + "style": "IPY_MODEL_81c40b1e7bc04ff5b45848d534a7eb66", + "value": "gather examples: 100%" + } + }, + "6372c74bb16e4bc18a4fb35dcfb58e69": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_266059b4dae84e918ff61474da0b05c8", + "max": 36963, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4a27079fb25744e89461b92cb6f89de3", + "value": 36963 + } + }, + "8b3fd08c655a44d9a7f37fd73f756370": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_44dee7908f0d49669921f173a90ec536", + "placeholder": "​", + "style": "IPY_MODEL_64ba102445024f1fb9205990c457aa50", + "value": " 36963/36963 [00:00<00:00, 549257.81it/s]" + } + }, + "a43064e7c0234afdbf6ed7cb7b67b426": { + "model_module": 
"@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "38802fe54df5428ba519218fc8e43d33": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + 
"grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "81c40b1e7bc04ff5b45848d534a7eb66": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "266059b4dae84e918ff61474da0b05c8": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + 
"overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4a27079fb25744e89461b92cb6f89de3": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "44dee7908f0d49669921f173a90ec536": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "64ba102445024f1fb9205990c457aa50": { + "model_module": "@jupyter-widgets/controls", + "model_name": 
"DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "2ba94ac719dc4d7ba5ab2e98661ef0ed": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HBoxModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b50870fb01d842158e43283d006f9949", + "IPY_MODEL_c805692f6fee406ebec95a28b31573d6", + "IPY_MODEL_68ee51abad1344408cf94aa6cd510ff8" + ], + "layout": "IPY_MODEL_9ba3187dc1354099b37e847479769fee" + } + }, + "b50870fb01d842158e43283d006f9949": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f0629dd648ad4d6e8bf64e4ff908c183", + "placeholder": "​", + "style": "IPY_MODEL_2a3207a4dbf449a3b528a2118ac492cc", + "value": "indexing classes: 100%" + } + }, + "c805692f6fee406ebec95a28b31573d6": { + "model_module": "@jupyter-widgets/controls", + "model_name": "FloatProgressModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c859c8774c5c4a2087bba61a15795226", + "max": 36963, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cf28890e93e1424bbb6db38b0659b1a7", + "value": 36963 + } + }, + "68ee51abad1344408cf94aa6cd510ff8": { + "model_module": "@jupyter-widgets/controls", + "model_name": "HTMLModel", + "model_module_version": "1.5.0", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7458b09153b64d91afd947bc4e613e57", + "placeholder": "​", + "style": "IPY_MODEL_fecbab879514406db7cd3452a3d4ad07", + "value": " 36963/36963 [00:00<00:00, 683225.26it/s]" + } + }, + "9ba3187dc1354099b37e847479769fee": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f0629dd648ad4d6e8bf64e4ff908c183": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2a3207a4dbf449a3b528a2118ac492cc": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "c859c8774c5c4a2087bba61a15795226": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cf28890e93e1424bbb6db38b0659b1a7": { + "model_module": "@jupyter-widgets/controls", + "model_name": "ProgressStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7458b09153b64d91afd947bc4e613e57": { + "model_module": "@jupyter-widgets/base", + "model_name": "LayoutModel", + "model_module_version": "1.2.0", 
+ "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fecbab879514406db7cd3452a3d4ad07": { + "model_module": "@jupyter-widgets/controls", + "model_name": "DescriptionStyleModel", + "model_module_version": "1.5.0", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + } + } + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "**Introduction**\n", + "\n", + "This codelab walks you through how to use different Search and Store types for indexing embeddings for nearest neighbor lookups, both exact lookup and approximate lookups.\n", + "The Indexer uses two components to handle the indexing:\n", + "\n", + "\n", + "1. 
Search: The component that, given an embedding, looks up its k-nearest neighbors\n", + "2. Store: stores and retrieves the metadata associated with a given embedding\n", + "\n", + "\n", + "\n", + "The package currently supports the following NN algorithms (Search component):\n", + "\n", + "* LinearSearch\n", + "* nmslib\n", + "* Faiss\n", + "\n", + "It supports the following Stores:\n", + "\n", + "* MemoryStore: For small datasets that fit in the memory\n", + "* CachedStore: For medium size datasets that would fit in the memory and disk of the machine\n", + "* RedisStore: For larger datasets that would require a server to store and retrieve the metadata\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ], + "metadata": { + "id": "ePmNIj8hSVAn" + } + }, + { + "cell_type": "code", + "source": [ + "#@title install git repo's indexing branch\n", + "!git clone https://github.com/tensorflow/similarity.git && cd similarity && git checkout indexing && pip install .[dev] && cd ..\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "aeptpGNhGoj0", + "outputId": "5dfdbfce-3074-48cc-8aca-2348aa0f3875" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'similarity'...\n", + "remote: Enumerating objects: 7082, done.\u001b[K\n", + "remote: Counting objects: 100% (1243/1243), done.\u001b[K\n", + "remote: Compressing objects: 100% (371/371), done.\u001b[K\n", + "remote: Total 7082 (delta 954), reused 1071 (delta 862), pack-reused 5839\u001b[K\n", + "Receiving objects: 100% (7082/7082), 166.74 MiB | 17.24 MiB/s, done.\n", + "Resolving deltas: 100% (4420/4420), done.\n", + "Branch 'indexing' set up to track remote branch 'indexing' from 'origin'.\n", + "Switched to a new branch 'indexing'\n", + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Processing /content/similarity\n", + " Installing build
dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting umap-learn\n", + " Downloading umap-learn-0.5.3.tar.gz (88 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.2/88.2 KB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting nmslib\n", + " Downloading nmslib-2.1.1-cp38-cp38-manylinux2010_x86_64.whl (13.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.4/13.4 MB\u001b[0m \u001b[31m86.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: matplotlib in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (3.5.3)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (1.22.4)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (4.64.1)\n", + "Requirement already satisfied: Pillow in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (8.4.0)\n", + "Requirement already satisfied: tensorflow-datasets>=4.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (4.8.3)\n", + "Requirement already satisfied: bokeh in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (2.4.3)\n", + "Requirement already satisfied: tabulate in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (0.8.10)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (1.3.5)\n", + "Collecting distinctipy\n", + " Downloading 
distinctipy-1.2.2-py3-none-any.whl (25 kB)\n", + "Collecting mypy<=0.982\n", + " Downloading mypy-0.982-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.4 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.4/17.4 MB\u001b[0m \u001b[31m92.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting faiss-gpu\n", + " Downloading faiss_gpu-1.7.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.5/85.5 MB\u001b[0m \u001b[31m11.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting types-tabulate\n", + " Downloading types_tabulate-0.9.0.1-py3-none-any.whl (3.1 kB)\n", + "Collecting black\n", + " Downloading black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m86.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting twine\n", + " Downloading twine-4.0.2-py3-none-any.whl (36 kB)\n", + "Collecting pytype\n", + " Downloading pytype-2023.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m97.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting mkdocs-autorefs\n", + " Downloading mkdocs_autorefs-0.4.1-py3-none-any.whl (9.8 kB)\n", + "Collecting mkdocs-material\n", + " Downloading mkdocs_material-9.1.1-py3-none-any.whl (7.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m114.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pre-commit\n", + " Downloading pre_commit-3.1.1-py2.py3-none-any.whl (202 kB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m202.3/202.3 KB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting redis\n", + " Downloading redis-4.5.1-py3-none-any.whl (238 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.5/238.5 KB\u001b[0m \u001b[31m30.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: setuptools in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (57.4.0)\n", + "Collecting mkdocstrings\n", + " Downloading mkdocstrings-0.20.0-py3-none-any.whl (26 kB)\n", + "Collecting types-termcolor\n", + " Downloading types_termcolor-1.1.6.1-py3-none-any.whl (2.4 kB)\n", + "Collecting types-redis\n", + " Downloading types_redis-4.5.1.4-py3-none-any.whl (55 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.4/55.4 KB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: wheel in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (0.38.4)\n", + "Collecting isort\n", + " Downloading isort-5.12.0-py3-none-any.whl (91 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m91.2/91.2 KB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting mkdocs\n", + " Downloading mkdocs-1.4.2-py3-none-any.whl (3.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m118.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting flake8\n", + " Downloading flake8-6.0.0-py2.py3-none-any.whl (57 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.8/57.8 KB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already 
satisfied: pytest in /usr/local/lib/python3.8/dist-packages (from tensorflow-similarity==0.17.0.dev18) (3.6.4)\n", + "Requirement already satisfied: tomli>=1.1.0 in /usr/local/lib/python3.8/dist-packages (from mypy<=0.982->tensorflow-similarity==0.17.0.dev18) (2.0.1)\n", + "Collecting mypy-extensions>=0.4.3\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Requirement already satisfied: typing-extensions>=3.10 in /usr/local/lib/python3.8/dist-packages (from mypy<=0.982->tensorflow-similarity==0.17.0.dev18) (4.5.0)\n", + "Requirement already satisfied: tensorflow-metadata in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (1.12.0)\n", + "Requirement already satisfied: promise in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (2.3)\n", + "Requirement already satisfied: toml in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (0.10.2)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (8.1.3)\n", + "Requirement already satisfied: wrapt in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (1.15.0)\n", + "Requirement already satisfied: absl-py in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (1.4.0)\n", + "Requirement already satisfied: protobuf>=3.12.2 in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (3.19.6)\n", + "Requirement already satisfied: dm-tree in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (0.1.8)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.8/dist-packages (from 
tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (5.4.8)\n", + "Requirement already satisfied: importlib-resources in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (5.12.0)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (2.25.1)\n", + "Requirement already satisfied: etils[enp,epath]>=0.9.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (1.0.0)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.8/dist-packages (from tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (2.2.0)\n", + "Requirement already satisfied: packaging>=22.0 in /usr/local/lib/python3.8/dist-packages (from black->tensorflow-similarity==0.17.0.dev18) (23.0)\n", + "Collecting pathspec>=0.9.0\n", + " Downloading pathspec-0.11.0-py3-none-any.whl (29 kB)\n", + "Requirement already satisfied: platformdirs>=2 in /usr/local/lib/python3.8/dist-packages (from black->tensorflow-similarity==0.17.0.dev18) (3.0.0)\n", + "Requirement already satisfied: Jinja2>=2.9 in /usr/local/lib/python3.8/dist-packages (from bokeh->tensorflow-similarity==0.17.0.dev18) (3.1.2)\n", + "Requirement already satisfied: tornado>=5.1 in /usr/local/lib/python3.8/dist-packages (from bokeh->tensorflow-similarity==0.17.0.dev18) (6.2)\n", + "Requirement already satisfied: PyYAML>=3.10 in /usr/local/lib/python3.8/dist-packages (from bokeh->tensorflow-similarity==0.17.0.dev18) (6.0)\n", + "Collecting pyflakes<3.1.0,>=3.0.0\n", + " Downloading pyflakes-3.0.1-py2.py3-none-any.whl (62 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.8/62.8 KB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting mccabe<0.8.0,>=0.7.0\n", + " Downloading mccabe-0.7.0-py2.py3-none-any.whl 
(7.3 kB)\n", + "Collecting pycodestyle<2.11.0,>=2.10.0\n", + " Downloading pycodestyle-2.10.0-py2.py3-none-any.whl (41 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.3/41.3 KB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.8/dist-packages (from matplotlib->tensorflow-similarity==0.17.0.dev18) (4.38.0)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->tensorflow-similarity==0.17.0.dev18) (1.4.4)\n", + "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.8/dist-packages (from matplotlib->tensorflow-similarity==0.17.0.dev18) (2.8.2)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.8/dist-packages (from matplotlib->tensorflow-similarity==0.17.0.dev18) (0.11.0)\n", + "Requirement already satisfied: pyparsing>=2.2.1 in /usr/local/lib/python3.8/dist-packages (from matplotlib->tensorflow-similarity==0.17.0.dev18) (3.0.9)\n", + "Collecting pyyaml-env-tag>=0.1\n", + " Downloading pyyaml_env_tag-0.1-py3-none-any.whl (3.9 kB)\n", + "Collecting watchdog>=2.0\n", + " Downloading watchdog-2.3.1-py3-none-manylinux2014_x86_64.whl (80 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.6/80.6 KB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting markdown<3.4,>=3.2.1\n", + " Downloading Markdown-3.3.7-py3-none-any.whl (97 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m97.8/97.8 KB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting mergedeep>=1.3.4\n", + " Downloading mergedeep-1.3.4-py3-none-any.whl (6.4 kB)\n", + "Collecting ghp-import>=1.0\n", + " Downloading ghp_import-2.1.0-py3-none-any.whl (11 kB)\n", + "Requirement already 
satisfied: importlib-metadata>=4.3 in /usr/local/lib/python3.8/dist-packages (from mkdocs->tensorflow-similarity==0.17.0.dev18) (6.0.0)\n", + "Collecting colorama>=0.4\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Collecting mkdocs-material-extensions>=1.1\n", + " Downloading mkdocs_material_extensions-1.1.1-py3-none-any.whl (7.9 kB)\n", + "Collecting pymdown-extensions>=9.9.1\n", + " Downloading pymdown_extensions-9.10-py3-none-any.whl (235 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.5/235.5 KB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pygments>=2.14\n", + " Downloading Pygments-2.14.0-py3-none-any.whl (1.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m74.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: regex>=2022.4.24 in /usr/local/lib/python3.8/dist-packages (from mkdocs-material->tensorflow-similarity==0.17.0.dev18) (2022.6.2)\n", + "Collecting requests>=2.19.0\n", + " Downloading requests-2.28.2-py3-none-any.whl (62 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.8/62.8 KB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: MarkupSafe>=1.1 in /usr/local/lib/python3.8/dist-packages (from mkdocstrings->tensorflow-similarity==0.17.0.dev18) (2.1.2)\n", + "Collecting pybind11<2.6.2\n", + " Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.5/188.5 KB\u001b[0m \u001b[31m23.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->tensorflow-similarity==0.17.0.dev18) (2022.7.1)\n", + "Collecting 
identify>=1.0.0\n", + " Downloading identify-2.5.18-py2.py3-none-any.whl (98 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.8/98.8 KB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nodeenv>=0.11.1\n", + " Downloading nodeenv-1.7.0-py2.py3-none-any.whl (21 kB)\n", + "Collecting virtualenv>=20.10.0\n", + " Downloading virtualenv-20.20.0-py3-none-any.whl (8.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.7/8.7 MB\u001b[0m \u001b[31m128.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting cfgv>=2.0.0\n", + " Downloading cfgv-3.3.1-py2.py3-none-any.whl (7.3 kB)\n", + "Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.8/dist-packages (from pytest->tensorflow-similarity==0.17.0.dev18) (1.11.0)\n", + "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.8/dist-packages (from pytest->tensorflow-similarity==0.17.0.dev18) (22.2.0)\n", + "Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.8/dist-packages (from pytest->tensorflow-similarity==0.17.0.dev18) (9.1.0)\n", + "Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.8/dist-packages (from pytest->tensorflow-similarity==0.17.0.dev18) (0.7.1)\n", + "Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.8/dist-packages (from pytest->tensorflow-similarity==0.17.0.dev18) (1.4.1)\n", + "Requirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.8/dist-packages (from pytest->tensorflow-similarity==0.17.0.dev18) (1.15.0)\n", + "Collecting pydot>=1.4.2\n", + " Downloading pydot-1.4.2-py2.py3-none-any.whl (21 kB)\n", + "Collecting ninja>=1.10.0.post2\n", + " Downloading ninja-1.11.1-py2.py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (145 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m146.0/146.0 
KB\u001b[0m \u001b[31m18.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting libcst>=0.4.9\n", + " Downloading libcst-0.4.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.8/2.8 MB\u001b[0m \u001b[31m76.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting importlab>=0.8\n", + " Downloading importlab-0.8-py2.py3-none-any.whl (21 kB)\n", + "Collecting networkx<2.8.4\n", + " Downloading networkx-2.8.3-py3-none-any.whl (2.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m71.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: async-timeout>=4.0.2 in /usr/local/lib/python3.8/dist-packages (from redis->tensorflow-similarity==0.17.0.dev18) (4.0.2)\n", + "Collecting rfc3986>=1.4.0\n", + " Downloading rfc3986-2.0.0-py2.py3-none-any.whl (31 kB)\n", + "Collecting readme-renderer>=35.0\n", + " Downloading readme_renderer-37.3-py3-none-any.whl (14 kB)\n", + "Collecting requests-toolbelt!=0.9.0,>=0.8.0\n", + " Downloading requests_toolbelt-0.10.1-py2.py3-none-any.whl (54 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.5/54.5 KB\u001b[0m \u001b[31m6.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting keyring>=15.1\n", + " Downloading keyring-23.13.1-py3-none-any.whl (37 kB)\n", + "Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.8/dist-packages (from twine->tensorflow-similarity==0.17.0.dev18) (1.26.14)\n", + "Collecting pkginfo>=1.8.1\n", + " Downloading pkginfo-1.9.6-py3-none-any.whl (30 kB)\n", + "Collecting rich>=12.0.0\n", + " Downloading rich-13.3.2-py3-none-any.whl (238 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m238.7/238.7 KB\u001b[0m \u001b[31m28.8 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting types-pyOpenSSL\n", + " Downloading types_pyOpenSSL-23.0.0.4-py3-none-any.whl (6.9 kB)\n", + "Collecting cryptography>=35.0.0\n", + " Downloading cryptography-39.0.2-cp36-abi3-manylinux_2_28_x86_64.whl (4.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.2/4.2 MB\u001b[0m \u001b[31m118.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.8/dist-packages (from umap-learn->tensorflow-similarity==0.17.0.dev18) (1.2.1)\n", + "Requirement already satisfied: scipy>=1.0 in /usr/local/lib/python3.8/dist-packages (from umap-learn->tensorflow-similarity==0.17.0.dev18) (1.10.1)\n", + "Requirement already satisfied: numba>=0.49 in /usr/local/lib/python3.8/dist-packages (from umap-learn->tensorflow-similarity==0.17.0.dev18) (0.56.4)\n", + "Collecting pynndescent>=0.5\n", + " Downloading pynndescent-0.5.8.tar.gz (1.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m77.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: cffi>=1.12 in /usr/local/lib/python3.8/dist-packages (from cryptography>=35.0.0->types-redis->tensorflow-similarity==0.17.0.dev18) (1.15.1)\n", + "Requirement already satisfied: zipp in /usr/local/lib/python3.8/dist-packages (from etils[enp,epath]>=0.9.0->tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (3.15.0)\n", + "Collecting jeepney>=0.4.2\n", + " Downloading jeepney-0.8.0-py3-none-any.whl (48 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.4/48.4 KB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting SecretStorage>=3.2\n", + " Downloading SecretStorage-3.3.3-py3-none-any.whl (15 kB)\n", + "Collecting jaraco.classes\n", + " Downloading jaraco.classes-3.2.3-py3-none-any.whl (6.0 kB)\n", + "Collecting typing-inspect>=0.4.0\n", + " Downloading typing_inspect-0.8.0-py3-none-any.whl (8.7 kB)\n", + "Requirement already satisfied: llvmlite<0.40,>=0.39.0dev0 in /usr/local/lib/python3.8/dist-packages (from numba>=0.49->umap-learn->tensorflow-similarity==0.17.0.dev18) (0.39.1)\n", + "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.8/dist-packages (from pynndescent>=0.5->umap-learn->tensorflow-similarity==0.17.0.dev18) (1.2.0)\n", + "Requirement already satisfied: docutils>=0.13.1 in /usr/local/lib/python3.8/dist-packages (from readme-renderer>=35.0->twine->tensorflow-similarity==0.17.0.dev18) (0.16)\n", + "Requirement already satisfied: bleach>=2.1.0 in /usr/local/lib/python3.8/dist-packages (from readme-renderer>=35.0->twine->tensorflow-similarity==0.17.0.dev18) (6.0.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (3.0.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.8/dist-packages (from 
requests>=2.19.0->tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.8/dist-packages (from requests>=2.19.0->tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (2022.12.7)\n", + "Collecting markdown-it-py<3.0.0,>=2.2.0\n", + " Downloading markdown_it_py-2.2.0-py3-none-any.whl (84 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.5/84.5 KB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from scikit-learn>=0.22->umap-learn->tensorflow-similarity==0.17.0.dev18) (3.1.0)\n", + "Collecting distlib<1,>=0.3.6\n", + " Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.5/468.5 KB\u001b[0m \u001b[31m44.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: filelock<4,>=3.4.1 in /usr/local/lib/python3.8/dist-packages (from virtualenv>=20.10.0->pre-commit->tensorflow-similarity==0.17.0.dev18) (3.9.0)\n", + "Requirement already satisfied: googleapis-common-protos<2,>=1.52.0 in /usr/local/lib/python3.8/dist-packages (from tensorflow-metadata->tensorflow-datasets>=4.2->tensorflow-similarity==0.17.0.dev18) (1.58.0)\n", + "Requirement already satisfied: webencodings in /usr/local/lib/python3.8/dist-packages (from bleach>=2.1.0->readme-renderer>=35.0->twine->tensorflow-similarity==0.17.0.dev18) (0.5.1)\n", + "Requirement already satisfied: pycparser in /usr/local/lib/python3.8/dist-packages (from cffi>=1.12->cryptography>=35.0.0->types-redis->tensorflow-similarity==0.17.0.dev18) (2.21)\n", + "Collecting mdurl~=0.1\n", + " Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n", + "Building wheels for collected packages: tensorflow-similarity, umap-learn, 
pynndescent\n", + " Building wheel for tensorflow-similarity (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for tensorflow-similarity: filename=tensorflow_similarity-0.17.0.dev18-py3-none-any.whl size=241562 sha256=446cc6a98f5235d8a0a757a6fdf62ae120f422aafac3483ac5d0e3a572c71efa\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-wujt_gjg/wheels/73/62/33/8ca1c2e61b184580b4b0caac916dda8778f0ca566e43e04ddf\n", + " Building wheel for umap-learn (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82829 sha256=4641ebf51eaec50dbb6752575e99cef1e5a8a68ce450422fdad4a40f66c1e75e\n", + " Stored in directory: /root/.cache/pip/wheels/a9/3a/67/06a8950e053725912e6a8c42c4a3a241410f6487b8402542ea\n", + " Building wheel for pynndescent (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pynndescent: filename=pynndescent-0.5.8-py3-none-any.whl size=55513 sha256=86a88c58d2e95ceae3ccba06dba8b2157f188314e41cb8e2655ac7c5f0575971\n", + " Stored in directory: /root/.cache/pip/wheels/1c/63/3a/29954bca1a27ba100ed8c27973a78cb71b43dc67aed62e80c3\n", + "Successfully built tensorflow-similarity umap-learn pynndescent\n", + "Installing collected packages: types-termcolor, types-tabulate, ninja, faiss-gpu, distlib, watchdog, virtualenv, rfc3986, requests, redis, pyyaml-env-tag, pygments, pyflakes, pydot, pycodestyle, pybind11, pkginfo, pathspec, nodeenv, networkx, mypy-extensions, mkdocs-material-extensions, mergedeep, mdurl, mccabe, jeepney, jaraco.classes, isort, identify, distinctipy, colorama, cfgv, typing-inspect, requests-toolbelt, readme-renderer, pre-commit, nmslib, mypy, markdown-it-py, markdown, importlab, ghp-import, flake8, cryptography, black, types-pyOpenSSL, SecretStorage, rich, pynndescent, pymdown-extensions, mkdocs, libcst, umap-learn, types-redis, pytype, mkdocs-material, mkdocs-autorefs, keyring, twine, tensorflow-similarity, mkdocstrings\n", + " Attempting 
uninstall: requests\n", + " Found existing installation: requests 2.25.1\n", + " Uninstalling requests-2.25.1:\n", + " Successfully uninstalled requests-2.25.1\n", + " Attempting uninstall: pygments\n", + " Found existing installation: Pygments 2.6.1\n", + " Uninstalling Pygments-2.6.1:\n", + " Successfully uninstalled Pygments-2.6.1\n", + " Attempting uninstall: pydot\n", + " Found existing installation: pydot 1.3.0\n", + " Uninstalling pydot-1.3.0:\n", + " Successfully uninstalled pydot-1.3.0\n", + " Attempting uninstall: networkx\n", + " Found existing installation: networkx 3.0\n", + " Uninstalling networkx-3.0:\n", + " Successfully uninstalled networkx-3.0\n", + " Attempting uninstall: markdown\n", + " Found existing installation: Markdown 3.4.1\n", + " Uninstalling Markdown-3.4.1:\n", + " Successfully uninstalled Markdown-3.4.1\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "ipython 7.9.0 requires jedi>=0.10, which is not installed.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed SecretStorage-3.3.3 black-23.1.0 cfgv-3.3.1 colorama-0.4.6 cryptography-39.0.2 distinctipy-1.2.2 distlib-0.3.6 faiss-gpu-1.7.2 flake8-6.0.0 ghp-import-2.1.0 identify-2.5.18 importlab-0.8 isort-5.12.0 jaraco.classes-3.2.3 jeepney-0.8.0 keyring-23.13.1 libcst-0.4.9 markdown-3.3.7 markdown-it-py-2.2.0 mccabe-0.7.0 mdurl-0.1.2 mergedeep-1.3.4 mkdocs-1.4.2 mkdocs-autorefs-0.4.1 mkdocs-material-9.1.1 mkdocs-material-extensions-1.1.1 mkdocstrings-0.20.0 mypy-0.982 mypy-extensions-1.0.0 networkx-2.8.3 ninja-1.11.1 nmslib-2.1.1 nodeenv-1.7.0 pathspec-0.11.0 pkginfo-1.9.6 pre-commit-3.1.1 pybind11-2.6.1 pycodestyle-2.10.0 pydot-1.4.2 pyflakes-3.0.1 pygments-2.14.0 pymdown-extensions-9.10 pynndescent-0.5.8 pytype-2023.3.2 pyyaml-env-tag-0.1 readme-renderer-37.3 redis-4.5.1 requests-2.28.2 requests-toolbelt-0.10.1 rfc3986-2.0.0 
rich-13.3.2 tensorflow-similarity-0.17.0.dev18 twine-4.0.2 types-pyOpenSSL-23.0.0.4 types-redis-4.5.1.4 types-tabulate-0.9.0.1 types-termcolor-1.1.6.1 typing-inspect-0.8.0 umap-learn-0.5.3 virtualenv-20.20.0 watchdog-2.3.1\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title check if the package is installed successfully\n", + "!pip list | grep tensorflow" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RKo2xxOa_xQ7", + "outputId": "3998c4fa-5c2e-43cd-d847-89936f550625" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "tensorflow 2.11.0\n", + "tensorflow-datasets 4.8.3\n", + "tensorflow-estimator 2.11.0\n", + "tensorflow-gcs-config 2.11.0\n", + "tensorflow-hub 0.12.0\n", + "tensorflow-io-gcs-filesystem 0.31.0\n", + "tensorflow-metadata 1.12.0\n", + "tensorflow-probability 0.19.0\n", + "tensorflow-similarity 0.17.0.dev18\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import gc\n", + "import os\n", + "\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "from tabulate import tabulate\n", + "import tensorflow as tf\n", + "import tensorflow_similarity as tfsim # main package\n", + "\n", + "# INFO messages are not printed.\n", + "# This must be run before loading other modules.\n", + "os.environ[\"TF_CPP_MIN_LOG_LEVEL\"] = \"1\"" + ], + "metadata": { + "id": "83Q84nCUF0es" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@title allow gpu memory to grow\n", + "tfsim.utils.tf_cap_memory()\n" + ], + "metadata": { + "id": "ylwoAusEmNSs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@title Clear out any old model state.\n", + "gc.collect()\n", + "tf.keras.backend.clear_session()\n", + "print(\"TensorFlow:\", tf.__version__)\n", + "print(\"TensorFlow Similarity\", tfsim.__version__)" + ], + "metadata": { + "colab": { 
+ "base_uri": "https://localhost:8080/" + }, + "id": "9rAWsA4qmQKp", + "outputId": "29b4da3b-e796-4235-d84d-1a9177d925d4" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "TensorFlow: 2.11.0\n", + "TensorFlow Similarity 0.17.0.dev18\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title load data\n", + "(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "gwpkWfVimcz8", + "outputId": "0ca61c36-b872-4390-e313-f1828ffc8250" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz\n", + "11490434/11490434 [==============================] - 0s 0us/step\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title the sampler\n", + "CLASSES = [2, 3, 1, 7, 9, 6, 8, 5, 0, 4]\n", + "NUM_CLASSES = 6 # @param {type: \"slider\", min: 1, max: 10}\n", + "CLASSES_PER_BATCH = NUM_CLASSES\n", + "EXAMPLES_PER_CLASS = 10 # @param {type:\"integer\"}\n", + "STEPS_PER_EPOCH = 1000 # @param {type:\"integer\"}\n", + "\n", + "sampler = tfsim.samplers.MultiShotMemorySampler(\n", + " x_train,\n", + " y_train,\n", + " classes_per_batch=CLASSES_PER_BATCH,\n", + " examples_per_class_per_batch=EXAMPLES_PER_CLASS,\n", + " class_list=CLASSES[:NUM_CLASSES], # Only use the first 6 classes for training.\n", + " steps_per_epoch=STEPS_PER_EPOCH,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 180, + "referenced_widgets": [ + "9dffbdfbc552434ebcc3f480daee4bd9", + "15445b1000d94eea943c0f2db61f3de1", + "b81c53fd06c24652affa33c7e5b95af3", + "36894b6f420e41b196c94b5bbedc2552", + "a22fcd57348e4b9b9b537c461b7240d2", + "e501f796a3a649ef9f2fccb9017279b3", + "b900825d8731446a8dae9299ecf5c1a3", + 
"003a9d6fd5a34026969972b568460f4b", + "d87efba0bf1f419d8f916a04d50b2057", + "3d4ba235da194b728ba6350e86d3b2d1", + "ee45e68a2a7f43c7b6eadddb5634eed5", + "7437216a87894cb1b15f3a1e190c8684", + "fc4b40f05f1b44f3b8fb924f7e56390d", + "3752a74a573947e797533665e85750f0", + "3a6c2ea5aea84cc29808825a0cde0f1b", + "6968ab9dba0d492f8a53db348595af10", + "b3de8ed0b9ba4787ab8a65ad34a8b396", + "d7560023f385471989ebb30475f76e02", + "afafac0b0078453fb3e72264bf54ad40", + "02ec015a1aa24dffab063edfeb453998", + "69ee26e15b064e90be44c0fcfc89e778", + "89c3ea106b5442cb96d77c658cfb35be", + "5838c303535a4d119bb20c72c2a8d4b0", + "ae0a4b489c30469da7554a4703c2ba2c", + "6372c74bb16e4bc18a4fb35dcfb58e69", + "8b3fd08c655a44d9a7f37fd73f756370", + "a43064e7c0234afdbf6ed7cb7b67b426", + "38802fe54df5428ba519218fc8e43d33", + "81c40b1e7bc04ff5b45848d534a7eb66", + "266059b4dae84e918ff61474da0b05c8", + "4a27079fb25744e89461b92cb6f89de3", + "44dee7908f0d49669921f173a90ec536", + "64ba102445024f1fb9205990c457aa50", + "2ba94ac719dc4d7ba5ab2e98661ef0ed", + "b50870fb01d842158e43283d006f9949", + "c805692f6fee406ebec95a28b31573d6", + "68ee51abad1344408cf94aa6cd510ff8", + "9ba3187dc1354099b37e847479769fee", + "f0629dd648ad4d6e8bf64e4ff908c183", + "2a3207a4dbf449a3b528a2118ac492cc", + "c859c8774c5c4a2087bba61a15795226", + "cf28890e93e1424bbb6db38b0659b1a7", + "7458b09153b64d91afd947bc4e613e57", + "fecbab879514406db7cd3452a3d4ad07" + ] + }, + "id": "AMtypckSmigX", + "outputId": "14e1f114-c68e-474f-f8fa-b74cfe560070" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "The initial batch size is 60 (6 classes * 10 examples per class) with 0 augmentations\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "filtering examples: 0%| | 0/60000 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title make index\n", + "x_index, y_index = 
tfsim.samplers.select_examples(x_train, y_train, CLASSES, 20)\n", + "model.reset_index()\n", + "model.index(x_index, y_index, data=x_index)" + ], + "metadata": { + "id": "LypwRy-LnBgD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#@title NN lookup results\n", + "# re-run to test on other examples\n", + "num_neighbors = 5\n", + "\n", + "# select\n", + "x_display, y_display = tfsim.samplers.select_examples(x_test, y_test, CLASSES, 1)\n", + "\n", + "# lookup nearest neighbors in the index\n", + "nns = model.lookup(x_display, k=num_neighbors)\n", + "\n", + "# display\n", + "for idx in np.argsort(y_display):\n", + " tfsim.visualization.viz_neigbors_imgs(x_display[idx], y_display[idx], nns[idx], fig_size=(16, 2), cmap=\"Greys\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "1f70394ab6a64358be4b03a75aaf58d1", + "89cbe354b3024e3d838df344521415aa", + "fa1b6f54f0544ed5becef2b513f4b9ec", + "514cdca68e1b4eb4b717b7e7d24c209f" + ] + }, + "id": "AQyO36ZdnD6J", + "outputId": "ca32378a-2146-4c05-b2d0-a851d865fe92" + }, + "execution_count": null, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1f70394ab6a64358be4b03a75aaf58d1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "filtering examples: 0%| | 0/10000 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA3sAAACOCAYAAACIehHUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAAsTAAALEwEAmpwYAAAfaUlEQVR4nO3debwT1f3/8fcREBBlxwUFsX4rFK0LImpB9IGVWi2C1vbnWhSXqqC2bmjVuotKsVYF6oLWqlhRKlAqFEV9WEBrQRS1ltYiKFVQETdA2eb3x4R0Ph/vTW64yU3u3Nfz8cjDeWcmkxNynOTczGdOiKJIAAAAAIB02aLcDQAAAAAAFB+DPQAAAABIIQZ7AAAAAJBCDPYAAAAAIIUY7AEAAABACjHYAwAAAIAUYrAHAAAAACnUYAd7IYSmIYRxIYQlIYTPQwivhBC+X+52obhK8T6HEPYOIcwLIazO/HfvHNt2CSE8GUJYGUJYFkK4M4TQOLE+CiGsCiF8kbndm1g3LXH/FyGEtSGE1xLrvxNCeCnzuhaEEPq45+4QQhgfQvg08/wP1+Z1o2qV3MdCCLuFECaHED4MIXwcQvhLCKFr4rGDM/v/LISwNIRwi+ufz4UQvkz0wYWJdUeGEGaFED7JPO+9IYRtavO6UbVK7mP59pVp+29DCMszffBPIYQda/q6QginhxDeyvS/6SGEjrV53ahaPe9jPw8hLMocx94LIfzaPXZxCGFN4jg2o5o2zAzxZ3Ljqtajdiqgj33hbhtCCHck1ld7rAkhXB1CWOce/43E+rtDCAtDCBtDCKdU8bp/nembK0MIY0IITWrzugvVYAd7khpLelfSwZJaSbpC0oQQQpdyNgpFV9T3OYSwpaTJkh6S1EbSA5ImZ+6vyhhJH0jaQdLemXac47bZK4qirTO30zfdGUXR9xP3by1pjqTHMu1oK+lPkkZKai3pFkl/CiG0Sez3j5KWSeosaVtJv9qc14y8KrmPtZY0RVJXSdtJeimz7022kvQzSe0l7S/pUEkXuf0PS/TDron7W0m6XlJHSd+StKPi/ojiq9g+VoN9nS/pQEl7Ku4rKyVt+oKV83WFEA6RdKOkgZLaSnpb0iOb85qRV33uY1Mk9YiiqKWkPSTtJek8t/8BieNY/yrae6KkOv0C3gCVtY+571PbS1qj/32nOkT5jzWPJvcRRdGixLpXFffXl6t46ksl9VTcN3eT1EPxa687URRxy9wkLZD0w3K3g1vlvs+S+kv6r6SQuO8dSYdXs/2bko5I5JGS7krkSNL/1eB5u0jaIKlLJv9A0htum39JOi3RzsWSGpX737sh3iqpj7lt22b6XLtq1l8g6U+J/Jyk02vY7mMkvVbuf/uGcquUPpZvX5LGSrolse5ISQtr8roU/4FqdGJdx0z/3bXc//4N4VZf+pjbTztJT0sak7hvsaTv5mhrq8zn5wGZ/tW43P/2DeVWl33MPXawpEWbHpvvWCPpakkP1WC/sySd4u6bK+lHiXyCpHfr8t+5If+yZ4QQtlM84n6j3G1B6RThfd5d0oIo839sxoLM/VW5TdJxIYStMqcufV/SdLfN85nTVv6Y4y9cP5H01yiKFifuC26boPgvR1L8obVQ0gMhhBUhhL+HEA7O8bpQJBXaxzbpK2lZFEUrcqz37R4RQvgohDA789fP6lT1WJRAhfWxfPsaJ6l3CKFjCGErSSdKmlbVk1TzukIVy3sIJVXP+phCCCeEED6T9JHiX/bucvt/OMSns88IIezl1t2o+I8Sy2r86lBrZehjSYMl/d49Nt+xZkCIT0V/I4RwdoFt9fveKYTQqsB9bDYGe5Iy584+LOmBKIr+We72oDSK9D5vLelTd9+nkqqrVXpe8YHnM0lLFf+FZ1Ji/cGKf7
XrJuk9SVOrqRf4iaTfJfILkjqGEI4PITQJIQyWtKvi0/IkaSfFf/V6VvHpCqMUn97QPu8rxGar0D62qW07SRqt+Ne7rwkhDFF8qknydN/hkr6h+BTNuxWfKrxrFY89TPGH5y+raSOKpAL7WL59/VvxqVv/zTz+W5Ku9U9QzeuaLunHIYQ9QwjNFfevSP87zqEE6mEfUxRF46P4NM7dJP1W0vLEticq/pzdWfFn4l9CCK0lKYTQU1Jv/e/UYtSBMvWxTc+9s+LvXg8k7s53rJmg+NjVQdIZkn4ZQji+hu2cLun8EF9HYXv97xTjOjuONfjBXghhC0kPSloraViZm4MSqen7nPmLzabi24Oq2OQLSS3dfS0lfV7Nc05XXDvXQnFdVBtJN2/aJoqi56MoWhtF0SeKa1t2UXxASe6nj+IB2+OJx61QfG75BYo/1A5XfOrK0swmayQtjqJoXBRF66Io+oPiL1y9q3vtqJ1K7WOZ7TpImqH41Kav1TyFEAZJGiHp+1EUfbTp/iiK/hZF0edRFH0VRdEDkmZLOsI99gBJ4yUdG0XRv6p73ai9Cu1j+fY1WlJTxafXtcjsx/yyV93riqLoaUlXSZqo+FS8xZn9LhVKop72sawoiv6t+JeiMYn7ZkdRtCaKotVRFI2Q9ImkgzLPO0bS+VEUra/utaK4ytHHnJMlzYqi6O1Nd+Q71kRR9I8oit6LomhDFEVzJP1G0rF5nmeTGyTNl/SK4msvTJK0TvYPEqVVl+eMVtpN8U+p9yv+S0/zcreHW+W/z4p/LVsqe474ElVdO9Be8V+GWiXuGyTp9Wr23UjxwWtPd/89ik83yNWuxorPVf9eJp8maZHbZoGkgeV+P9J4q+Q+pvhL03xJN1XzfIdL+lBSrxq0bZqk8xJ5H8UXVRhQ7vcg7bdK7WP59iXp9eRxR/FFgyJJ7Qt9XYp/tVklqU2534803uprH6tifydJejVH296UdFSmL25UfPrmssxxMMosH1Tu9yONt3L1Mfe4f0kakmebnMcaxWe9/LGK+79Ws1fFNmdKeqFO/93L/caX86b4p/4XJW1d7rZwqx/vs6QtMweT8xX/tXpYJm9ZzfaLFF+JqXHmg+UJSeMz63ZXfNWxRopPR7hNcZ1dk8Tjmys+LaFfFfveR/HVw1pmHjs7sa6t4qveDc7s/1hJHyvzBYtbg+ljLRVfgfPOah7bT9IKSX2rWNda0vckNcvs+8TMh99umfV7KP7L5P8r979/Q7hVcB/LuS/FX+wmKr4IRhNJv5D035q8rkzf20PxF8TOii8YdGO534u03upxHztd0raZ5e6Kf9m7NZM7Kz6jZctMf7pY8aCuXaZfbZ+47ad4sLdjde3kVn/7WOYx38l8jm3j7s95rFF8JlWbzPpeik9LH+za0kzx2S9nZJa3yKzbUfEFX4Li6ym8K6l/nf67l/uNL2OH2znzP/WXin9N2XQ7sdxt41bZ77PiQdY8xadKvixpn8S6X0ialsh7Zw4aKxUXjk+QtF1mXT/Fg7tVin8dmSTpm+65js8cvEIV7XhE8UDwU0mPbvqwS6w/SNJrmdc7V/ylsiH2scGZtq1ybeucWf+spPVu3bTMug6S/q74VJZPFH9AH5Z43vsV/1U8+dg3Nvc1c6uffawG+2qnuDbng0w/mqXMr8j5XpfiL/0LMv13meJTjbnCMH3M7+t+xX94WqX49LuRkppl1u2e6EMrJM2U1LOa9nYRV+NMbR/L3HeXpAer2E/OY43i71srMu39pxJnuGTWP5d5bcnbIZl1fTP9crXi73x1Ps7YdMlRAAAAAECKNPgLtAAAAABAGjHYAwAAAIAUYrAHAAAAACnEYA8AAAAAUojBHgAAAACkUONCNm7fvn3UpUuXEjUFxbZ48WJ99NFHodztKAR9rH6hj6EuzJs376MoijqUux01RR+rf+hjKDX6GEqtuj5W0GCvS5cumjt3bvFahZLq2bNnuZtQMPpY/UIfQ10IIS
wpdxsKQR+rf+hjKDX6GEqtuj7GaZwAAAAAkEIM9gAAAAAghRjsAQAAAEAKMdgDAAAAgBRisAcAAAAAKcRgDwAAAABSiMEeAAAAAKQQgz0AAAAASCEGewAAAACQQo3L3QAAALB5Zs+ebfLVV19t8qxZs0w+9thjTR42bJjJ+++/f/EaBwAoO37ZAwAAAIAUYrAHAAAAACnEYA8AAAAAUqjB1uytXbs2u7xo0SKzbp999jH5yy+/NPmMM84weezYsSY3atSoGE1EPTN58mSTzz777Ozy+++/b9ZdeumlJg8fPtzk1q1bF7dxgJM8BkrSbbfdZnLz5s1NPuecc0zmOFcZhgwZYvLAgQNNvvzyy02+8847Te7du7fJJ510ksnJfsFxCcWwbt06k/v27ZtdfvHFF826m266yWT//atVq1Ymc1wCvo5f9gAAAAAghRjsAQAAAEAKpeY0zjVr1pj83nvvmTx+/HiTb7/99uzyypUrc+57iy3smPi+++4z2Z/u9Jvf/CZ3Y5EKy5YtM/mss84yefny5dnlEIJZd/PNN5v85ptvmjxp0qQitBANWRRFJvvT1X/5y1+a/Mgjj+Tc36mnnmry1ltvXYvWoVgOO+wwk6+77jqTmzZtavLBBx9s8ieffGLyT3/6U5OTUzHMmzfPrKMPoCr+2LN06VKTjz76aJPnz5+fXfaflSNHjsyZO3bsaHLjxvZr7Zw5c0xu1qxZdc1Givnva7n4PpKG09f5ZQ8AAAAAUojBHgAAAACkEIM9AAAAAEihiq3Z27hxo8nvvvuuyaNHjzZ54sSJJi9ZsqQ0DavCww8/bPKFF15ocufOneusLSidK6+80uRRo0aZ7KfoKMSf//xnk//xj3+Y3L17983eNxqGd955x2RfuzVu3Lha7d9fkt/X/PXo0aNW+8fm8VMp5ONrotq0aWOyr0lPTiHj3/Nbb721oOdGOvlrJrz00ksm9+vXb7P3ne+aCvnW9+nTx2Rfw7fllltuXsNQpxYuXGjyxx9/bLL//HvsscdMfuKJJ7LL/hjo+WOir3N+/PHHcze2AvHLHgAAAACkEIM9AAAAAEghBnsAAAAAkEIVU7Pn52GZOnWqycOGDavL5hgHHnigyS+++KLJfp4iP8cfNXv101NPPWXyiBEjTPZ1pbWxYcMGky+//HKTk+ebI71Wr15t8k033WTyPffcY/KKFSuyy74/FrN/StKUKVNM9sfoZ555xuS+ffsW9flRN/zceXfccUd2ea+99jLrzjvvPJO7dOlSsnahfNavX2/yggULTPZzPfrvRIXYe++9TR4yZIjJgwYNMtkfI8eOHWvyyy+/bLKfp89/1qJ4ktcx8N9x5s6da7K/ToGvOfd9au3atUVoYdV8Haif99jP7Thr1iyTv/GNb5SkXbXBL3sAAAAAkEIM9gAAAAAghRjsAQAAAEAKVUzN3s4772xyvnkwiumqq64yeeDAgSZ37drV5BYtWpS8TSi/oUOHmlxoDdTxxx+fXb7rrrvMOt/Hnn32WZOffPJJk19//XWT99hjj4Lagsr06aefmrznnnua7OcXLabtt9/e5EsuuSTn9n7uLF9H6o+j06dPzy43bdp0c5qICtC6devs8nHHHWfWXXvttSb7OfqQDhMmTDD55JNPNjmKIpPzfX/ztZ4nnHBCdnm//fYrqG2//vWvTZ4/f77JL7zwgsl+jmZq9ornt7/9rcnJurvly5cXtC/fp3bYYQeT/bU0PF/r6efOy8X3oXPPPddk/1p8DSs1ewAAAACAOsFgDwAAAABSiMEeAAAAAKRQxdTsldoPfvADk5P1Jb5OplGjRiZ/9dVXBT3XwoULTT7ggAMKejwqg59T6q233sq5vT9P+1e/+lV22c9dla+Gad26dSYX2gdRP0ycONHk2tToHXzwwSb7Ok/fB6dNm2Zyt27dCnq+I444wuRkjZ5k+yw1e+kwfPhwk3fddVeTqdlLp5kzZ5rs66latWplcv/+/U329b
zdu3cvWtuaNGli8g9/+EOT58yZY7Kfn23NmjUmN2/evGhta2j8XHm56vT8/NODBw82uX379ib7OtGWLVtuThNrpEePHiaPGTPGZP86/VyOfi7ISsAvewAAAACQQgz2AAAAACCFGOwBAAAAQApVTM3exRdfbHKy3qkq7dq1M/lHP/qRyaeddprJfl4yf553LlOmTKnxtpKdlwj114MPPmiyr8nz53HPmDHDZD8vDOD5+UUPPfTQnNsn526UpKOOOiq73LZtW7PuggsuMPnMM880udAavY8++shk39+Rfr6P+bxkyRKTff9G/TRy5EiTd999d5MPP/xwk4tZk1coX0fq5/xbuXKlyb7+at999y1NwxqAm2++2eQbbrghu7zNNtvUdXM222effWbyBx98YLKvWfU1fpWIX/YAAAAAIIUY7AEAAABACjHYAwAAAIAUqpiaPT8Piz8H3M+F17ixbXop59yYOnVqQdtvt912JWoJ6lKzZs1M9uejX3TRRSa3adOm5G1CuvgavXw1e4W47bbbirYvSXr88cdN3rhxo8nHHHOMyS1atCjq86Py+Jo8P2+krxtF/ZSvHriS3HvvvTnX+2sqdO3atYStaVjSMkehn5PW939fv/7mm2+azDx7AAAAAIA6wWAPAAAAAFKIwR4AAAAApFDF1Oz5c30POeSQ8jREX58r6KGHHjLZz9vi5/zr1KlTaRqGitKhQ4cab7to0SKTZ82alXN7P9ePn9cIqGs33nhjzvXnnXeeyY0aNSplc1ABevfubfKHH35YppagoVq/fr3JX3zxRc7tmzZtarKvz0LD4+fNe/TRR03239/8Z9vRRx9dmoYVEb/sAQAAAEAKMdgDAAAAgBRisAcAAAAAKVQxNXuV5Msvvyxo+x122MHkHXfcsZjNQQq89tprJuerK+jVq5fJfs4/oNh83cLIkSNNfu+990zu2LGjyb5+C+nXv39/k4cOHWryiBEj6rI5aIBWrlxp8vPPP59ze18PD3z11VcmX3755SZv2LDB5LPOOsvkbt26laZhRcQvewAAAACQQgz2AAAAACCFOI1TX7+sqv8Jd+PGjSZvsYUdI/tL+QKStGbNmuzy1VdfnXPbVq1amXzxxReXokkosxUrVpj8zDPPmDx16lST3377bZO32mork0877bTs8pFHHmnWNWnSJGf2x7U777zT5EsvvVS5jBo1ymSmWmh4ZsyYYfKPf/zjMrUEleTdd981ee3atTm3b9OmTXa5bdu2ObdNfq5K0vXXX2+yPx3d5/vuuy/n/tHw+P7q+5h3yimnlLA1pcEvewAAAACQQgz2AAAAACCFGOwBAAAAQApRsyfpueeeM3nSpEkm+xo9X5ty6623lqJZqOfeeOON7PKrr76ac9s+ffqY3KVLl1I0CSW2dOlSk6+55hqT77//fpN93VyhfM1Ukq8ruP32200eO3asycOHD8/5XC1btjS5X79+NWgh0mzatGkmn3766WVqCUrJH6cWL15s8rXXXmvyhAkTTPaXtveSNXu+D/Xs2dPk5cuXm+xrjUMIJvtrMOSrCUT6+enVHn74YZM/+eQTkwcNGmRyfZxejV/2AAAAACCFGOwBAAAAQAox2AMAAACAFGqwNXvJc8ivvPLKgh677bbbmtytWzeT/Vxa22yzjclbbrllQc+H+mncuHHlbgJKzM/Pc8ABB5j8/vvv53z8gQceaPLQoUMLev5kzdQf/vAHs+53v/udyf/5z39Mnj17dkHP5WubO3ToUNDjUT+tX78+u/zZZ5+ZdUuWLDHZr3/66adN9rUuu+yyi8nNmjXb7HaidPx1CXx9r5/LztfNderUyWR/nHzssceyy7fcckvOfRXquuuuq9XjkT6+/44ePdpk3+f8/KE77LBDaRpWQvyyBwAAAAApxGAPAAAAAFKIwR4AAAAApFCDqdn78MMPTT7//POzy37elnyWLVtm8nbbbZdz+2HDhpl80UUX5dw+WePXqlWrgtqG0v
Fzs8ybN8/khx56yORHH320xvv++9//bvL8+fNN9rUtrVu3rvG+UTp+7jpfo7fvvvuaPGrUKJO/853vmNy4cWGH5BNOOCG77I8VY8aMMfmvf/1rQfv2cz362mSkw8KFC01O1k9J0vjx47PL//znP806X6vl5zzr3r27yWvWrDHZzz/q/3848sgjTe7fv7/JnTt3zi43bdrUrPPHa+oBq+fn0fM1epdddlnOxw8cONBkP+/ebrvtZnKTJk1MPvPMM7PL3/3ud3M3No9CPndRf61du9Zk/x1/ypQpJr/yyivVPtbz368OPfTQwhtYYfhlDwAAAABSiMEeAAAAAKQQgz0AAAAASKF6W7Pnz7n18/288847Jvt5Mj799NPSNKwKvo7BZ2/nnXfOLl9wwQVm3dlnn23yFlswXi+VK664wuS5c+eaPGPGjKI91wcffGCyr/Vq27atySeeeKLJJ510kslvvfWWyS+++KLJyfoWX19BbUvN+Xn2ttpqK5MnTZpksp9nrLb+9re/ZZfvueeeou578eLFJp9zzjkmN2/e3OSddtrJ5COOOCK77OcmrY/zFNVXvq5u1qxZJh999NEmX3PNNSaPHTs2u/yzn/3MrNtrr71Mvvvuu032c8r62rDkfLdS/mOsr5FNHjf9HH89evQwecCAASb7ud4aMl/f6Och80aMGGHyJZdcUqvnL/S6Cbm0adOmaPtC5fI1eccdd1zO7ZPHQT+P3je/+U2TJ0+ebHL79u03p4kVhZECAAAAAKQQgz0AAAAASCEGewAAAACQQvWmZs+fj+/ndSl0DqlKlqw/TM4HKH29vqJjx4510qaG4I033jDZzzXk6xoK4evgfB2Nr13xPv74Y5PvuOOOnLkQ3/72t00++eSTN3tfDY2f08nPk+fn/slXs+f7wezZs032c3Qm5z1bt26dWdeuXTuT/VxBvXr1MvnBBx/M2bYXXnjBZP98yflBJWnChAnZZV8flawDQ2l98cUXJvu563w9r58bb8OGDdnl1atXm3U33nijyb5Gz/M15r7u86CDDsqZURq+Htd/PnXq1MnkoUOHFrR/X5Pua85nzpxZ7XP7+T59/frLL79s8mGHHWby559/bnKLFi3yNxgVx38HOuOMM4q270MOOcRkPy9kofxn45NPPmmyP8b6msFS4Jc9AAAAAEghBnsAAAAAkEIM9gAAAAAghepNzd64ceNMLmWN3j777GOynyurtpYuXWpyrlqZBx54wOSJEyeafO655xavYQ3M66+/bvKll15qcm1q9CTpwAMPzC6PHj3arPPndC9btszkMWPGmDx//nyT/TyR+Wr+UBqDBw82+fe//73Jl112mcn+/9dVq1aZfMMNN5i8YMGCGrelUaNGJvtj5lFHHZXz8X5OT8/3WT9nWnLuRlQOPwdtnz59TP7Wt76V8/F/+ctfssvr168369Iw/xS+fpzx85DtvffeJvvvIc8880zO/b399tsm+2swJOcn/fnPf27W+WOor8Hz9Yb++9r3vvc9k6dNm2ayrzVGZXrkkUdM9n0on+Tnla8d9vOD+n3nq6nz39P9PN/eoEGDTPb/P5UCv+wBAAAAQAox2AMAAACAFGKwBwAAAAApVG9q9o455hiTL7744qLtu2fPniY//fTTJm+99dZFey7p63Nt7b///tVue/3115tMXUzxzJkzx2Q/F0qh+vfvb/KIESOyy77mIZ8BAwbkXP/KK6+Y/NZbbxW0/6Rc/Q+5+VrMp556yuRkvVNVubZuuumm7PKpp55q1nXo0KGoz9WkSZOi7g91Y5dddjHZz0vma8b9XJG/+MUvsstTpkwx6/LNq4d0mDp1as7s58bzNX/efvvtZ/K9996bXd5jjz1yPjZZ3ydJ999/v8l+blL/Oe8/p/0xu9jf97B51qxZY/Kzzz5rcr4+5iXn0vv3v/9t1vlrJvj5c718/d3n3r17m+xrBOsCv+wBAAAAQAox2AMAAACAFGKwBwAAAAApVG
9q9op9HnVyLperrrqqpM9VG61bty53E1LLz+eTj5+bpV+/fiY/8cQTJjdv3nzzGlYDvgaw0JpAFEeLFi1M9vPkDRs2zOTVq1fn3F/Lli1NPuWUU0z2c0G2a9cuu0xNHari++hLL71ksq/19DXl06dPzy5369atyK1DJXj++edNPvPMM032dXC+HtjPuTlkyBCT/dzFvXr1Mrk2xy5/zFy8eLHJvsbP9/877rjD5OHDh5vsP/dRHjvttFNB2/u57JLz9Pl5ul999VWTR40alXPfvmbP92//PaBr164mN2vWLOf+S4FeDAAAAAApxGAPAAAAAFKIwR4AAAAApFC9qdlr27atyf6c8Pvuu8/k7t27m+zPy+7bt292udD5OpAOY8eONdnXR3kXXnihyTfffHOxm4R6zvchP6fTqlWrcj6+VatWJm+77bZFaRewSefOnU2eOXNmmVqCSuGvUzB+/HiT/XHL14FWEl8PNXnyZJN9LdcVV1xhsp931tfmo274ax4kv7NLUqdOnUz2c9n5OrpkXah/T30u9HoO9QG/7AEAAABACjHYAwAAAIAUqjencfpTLe++++6cGcjnJz/5Sc4M1FbHjh3L3QQAqJVKPm0znwEDBpi8YcOGMrUEtXHMMceUuwn1Gr/sAQAAAEAKMdgDAAAAgBRisAcAAAAAKcRgDwAAAABSiMEeAAAAAKQQgz0AAAAASCEGewAAAACQQgz2AAAAACCFGOwBAAAAQAox2AMAAACAFGKwBwAAAAApxGAPAAAAAFKIwR4AAAAApBCDPQAAAABIIQZ7AAAAAJBCIYqimm8cwoeSlpSuOSiynaMo6lDuRhSCPlbv0MdQF+pVP6OP1Uv0MZQafQylVmUfK2iwBwAAAACoHziNEwAAAABSiMEeAAAAAKQQgz0AAAAASCEGewAAAACQQgz2AAAAACCFGOwBAAAAQAox2AMAAACAFGKwBwAAAAApxGAPAAAAAFLo/wOAXrURSAaaRAAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA3sAAACOCAYAAACIehHUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAAsTAAALEwEAmpwYAAAfPElEQVR4nO3deZRU1d3u8WczIyBIAEUUUCKKoETtiIojouLENb5xQK6KQ8ToVRCjV0VxAOKQKAoZHAKKMoVIrmgicQjgQF6HRlED6mtQGQwgkwwiyrDvH+fQqd+2u7qru6qr6vD9rFVr1eOuOmcXvT1Vu+r8znbeewEAAAAAkqVOvjsAAAAAAMg+JnsAAAAAkEBM9gAAAAAggZjsAQAAAEACMdkDAAAAgARisgcAAAAACcRkDwAAAAASaKee7DnnZjvnNjvnNsa3j/PdJ2Rftv/OzrkTnXMfOec2OedmOec6pHnsj5xzrznn1jnnljrnbqvqtpxz7Zxz051za+LnXpnS1so5N8c5t9o595Vz7r+dcz2DbV/nnFvunFvvnBvnnGtYk9eNiuVrjDnn2qfsc8fNO+euT3lMa+fcpHgMrnXOTUxpa+mc+2M8jlY55yY653aN29o45yY75/4dP3eOc65HynPbOueejdu9c65jTV4z0iviMXauc+4f8X5ml7P9Xs65d+Lj1KfOuStS2pxzbqhzbnHcPmXH+ET2FfEYaxi/x62P3/OGBNvfxTn3u/gYt84592pK2x3OuS3BvvetyetGxQp1jDnnjnfObQ/aL65Kvys7Tjnn7nPOLYnbFjnnbqnJa64W7/1Oe5M0W9Ll+e4Ht+L5O0tqJWmdpHMkNZL0K0lvpHn8AkkjJdWV1EnSMkl9q7ItSbMkPSipvqTuktZIOiFuayRpf0Vf2DhJZ8Xt9eL2UyStkNRV0m7xv8E9+f5bJPWWzzEWPHcfSdskdUz5b69JekBS83gsHZLS9jtJL0raNW5/WdIDcdu+koZIahuP3yskrZLUNG7fXdJVko6U5FP3yY0xltLWW9K5koZJmh1sq37cj4HxcezHkjZK6h63XyzpI0l7S2oqabqk8fn+WyT1VsRj7O64fTdJXSQtl9QnpX2CpCmSWsfHssNS2u6QNCHf//Y7y61Qx5ik4yUtrU6/KztOKfqs1iS+307SfEln1+a/+079yx5QDWdLmu+9/5P3frOiN4ruzrkDKnh8R0kTvffbvPcLJb2uaAKWdlvOuaaKDj4jvfdbvPfvSXpa0qWS5L3f7L3/2Hu/XdGHpG2K3uhaxtu+WNJY7/187/1aScMlDcjWPwJyKtMxluoiSa967z+XJOfcyYregG7w3q+Lx9K7KY/fR9Iz3vv13vt1kv6f4vHpvf/Ue/+A935ZPH4fldRA0RuXvPcrvPe/k/R2Nl40alWtjTHv/cve+6mS/l3Otloq+qLhKR95W9KHkg6M289UdBxb4r3fKOleSec553apxmtG7arN49jFkoZ779d67z+U9Jji97t4f30lXeG9Xxkfy+Zm5yUiz7I2xmoo7XEq/qz2dcrjt0v6YRb2W2VM9qS745/25zjnjs93Z5Az2fo7d5X03o4Q/w+8UP+ZwIUelHSRc66+c25/Rb+CvFyFbbn4P7v/bEpOUrfUjTvn3pe0WdKzkv7gvf+yvG3H93d3zv2gSq8S1ZGvMSYpOpVE0RvY+JT/fISkjyWNd9Gpmm87545Laf+tpDOcc7s553aT9F+SZlSw/R8pmuz9K+NXhGwpxjFWIe/9CkmTJV3inKvrnDtSUgdFX4qV7Ta431DSflXZPqqlqMZYfNxqq++/3+3Yz+GSFkm6M35dHzjn/ivY7ZkuKpeY75z7eaYvFBkrxDEmSW2ccyucc58550Y555pk0O+0xynn3E3OuY
2SlkpqImlSZS8um3b2yd7/VXSqUjtJj0p6zjnXKb9dQg5k8+/cVNFpA6nWSWpWweP/Iumnkr5R9DP/2Pjb67Tb8t5vkDRH0m3OuUbOuUMVfRA332h77w9W9M34BbIfkMJt77hfUT9RM/kcYzscrejUyqdT/ttekk5WdErwHpLulzTdOdcqbn9H0QRudXzbpujUTiOuP3hK0p3xL4CofcU6xiozWdEpnt8qOhVvqPd+Sdz2N0mXO+c6OueaK/o3kILjILKmGMdY05Rtl7efvRR9SbpO0p6S/o+iSWOXuH2qolM/W0v6maRhzrl+VXh9qJ5CHWMfSfqRoi8Oekk6TNFpw1Xpd6XHKe/9PXG/DlX0Xlqr76M79WTPe/+m936D9/5b7/14RR+uT8t3v5Bdmfydg+Lc9uU8ZKOiyVWqXSVtKGdbLRUdBO5SdD753pJOcc5dVcVt9Vd0mt0SSb9XVHewtJzXt9l7P1nSTc657hVse8f97/UTNZevMRa4WNK0+DSSHb6R9Ln3fmx86tMUReNpx8V8pkr6H0VvQrsq+lZ0QtDfxpKeU1QLcXclfUCOFPEYq1B8utUURd+yN1D0jfyNzrnT44eMUzQZnK2ozmVW/N+/dxxEzRXpGNvxuPD9bkPKc7dIGuG9/857/4qicXRy/JoXeO//HZ/e+Q9JDyn6ghY5UKhjzHu/PB4L2733n0m6UdEX7FXpd5WOU/Gp6u8qGpN3VtLHrNqpJ3vl8LI/xSKZKvw7e++bptwWl/OQ+YouliJJin/m7xT/99C+krZ575/03m/13i9V9MFmxwEi7ba894u892d471t773soKkZ+K83rqh/v83vbju+v8N6vTvN8ZE9tjbEdj2msqEg9PC3l/bgvYd92+JGkR7z3X8dvfA8r5Y3XRVdwfUbRm9bAivaPvCiWMZZON0n/471/If6Q9bGkv0o6NX4d2733t3vvO3rv94r790V8Q+4V/BjzUU36Mn3//W7Hft4vr/sV9UF8DqxthTLGyutXujlSWb+rcZyqF/ez9vgCuDpPPm6SWii6YmGj+B++v6SvJXXOd9+4Fe7fWdGpHusUfePTSFEhbrlXf1L0DdNXik6xrKPo9JP/lvTLqmxL0aklzRR94/2/FV0JsXXcdoSiUxEaSGqs6LSBDZL2jNv7KLoi2YHxv8FMcTXOxI2xlOdcIOlzSS747y0lrVX0TWZdRd9Yr5HUKm6fJWlMPIYaKzqF8x9xW31Fv+g9o/gqr+Xst5Gi+gOv6MItjfL990jircjHWN14H1dKejW+Xz9u66To2/leij44dVJUE3pFyrY7xW0HSvrnjjZujLGUMXaPpFcUXaTsAEWTvz5xW/14TN0Wv66eit4rD4jb/1f8PKeovu8LSRfn+++RxFuBj7ETFNULO0VnYc2S9HhV+p3uOKXos9/AYIwtk3Rtrf7b5/uPn8dB11rRVeQ2KPpA/oakk/LdL26F/3dWdCnxjxT9FD9b9vLQD0t6OCX3ive/TtHk6zFJu1RxW4MlrYwPKq9LKklpO05RYfKG+E3vFUnHBv0comj5hfWSHpfUMN9/jyTe8j3G4v/2gqKr0ZW3rWMkfaDoQ3WppGNS2vZRNKFbHY+jv0naL2WMeUmb4ufuuKU+34e3fP89kngr8jE2oJxx8kRK+7mKPhxtUPQL8r2S6sRtnRVdmGOTootsDMn33yKptyIfYw0VnUq3XtF73pDguV0VfdH6taLlkH6S0jY5Pv5tjPtaqx/Cd6ZbIY8xRZ+XvoiPNUskjVZ0/YRK+53uOKVosvc3Re+vGxWVTdyiYLKZ65uLOwMAAAAASBBq9gAAAAAggZjsAQAAAEACMdkDAAAAgARisgcAAAAACcRkDwAAAAASqF4mD27VqpXv2LFjjrqCbPv888+1atWqoloclDFWXBhjqA1z585d5b1vne9+VBVjrPgwxpBrjDHkWkVjLKPJXseOHVVaWpq9XiGnSkpK8t2FjDHGigtjDLXBOb
co333IBGOs+DDGkGuMMeRaRWOM0zgBAAAAIIGY7AEAAABAAjHZAwAAAIAEYrIHAAAAAAnEZA8AAAAAEojJHgAAAAAkEJM9AAAAAEggJnsAAAAAkEBM9gAAAAAggZjsAQAAAEACMdkDAAAAgARisgcAAAAACcRkDwAAAAASqF6+OwAAqJqNGzea/N1335Xdf/fdd7O6r969e5vctm1bk5966imTjznmGJMbNGiQ1f6g+C1YsMDkRYsWpX38Qw89ZPILL7xg8kknnVR2/8UXX6xh71CIvvnmG5OXLFli8tSpUzPa3sMPP2zy+vXr02YgCfhlDwAAAAASiMkeAAAAACRQYk7j/Pbbb00ePXq0yRs2bDC5f//+Zffbt29v2rz3Jm/bti3ttjZt2mTyk08+afKcOXNMnjlzpsl16tg5d2lpadn9Qw45RCgMqafMSdLVV19t8ty5c02ePn162f1HHnkk7bZTT0eSpH333bc6XayyTz/9tOz+smXLTNupp55qcvPmzXPal51ZOKaGDx9u8ocffmjyypUrTU499rzzzjumLTyuZKpFixYmd+vWzeTwuFnT/aHwbd682eQ1a9akffwNN9xg8ssvv2zyqlWrMtp/OMYYc8mzcOFCk++55x6Tx40bl9X91a1b1+T58+eb3LVr16zuD7kRvpe+//771d5WOAdwzqV9/FtvvWVy+L49adIkk9euXWvyj3/847L706ZNM2177bVX+s5WEUdKAAAAAEggJnsAAAAAkEBM9gAAAAAggYq2Zm/Lli0mX3bZZSZPnjw57fNHjhxZdv+AAw4wbeH5uuHlzr/44osq97M8Ya3LfffdZzJ1eoUpvGTz448/bnI4bvbZZ58K28JzwO++++607ZU9vybtYdvnn39uMjV7ufPaa6+ZHNan5NKIESNMDv/OJSUlaTN2Dqm1oOFSCBMmTMjpvq+77jqTjz76aJP32GOPnO4fuREup5Ba037zzTebtvB6DB06dDD5/PPPN7lJkyYmX3755SaHy32EdaedOnWqqNsoIPfff7/Jo0aNMjm8FkEmMq3Zy1S4vdTrdIwZM8a03XvvvVnZJ7/sAQAAAEACMdkDAAAAgARisgcAAAAACVS0NXtLly41ubIavXQ++uijtO0NGzY0ubJ1V8L2vn37mnziiSea3KZNm8q6iDx48803TR48eLDJldXFpWvbfffd07aH2w7rSkNt27Y1OVwTrVevXib36NEj7fZQ/FJrRiXp7bffNrlZs2Yms2YZJGndunUmH3HEEWX3wzVnM7XbbruZvHz58rSPZ129ZAjXQHv22WdNvv766yt87o033mhyWGtcr15mH2Op8yxOM2bMMDms7azpsSlV+Jm8Z8+eWdu2JHXp0sXkww8/vOx+586ds7qvHThyAgAAAEACMdkDAAAAgARisgcAAAAACVQ0NXuffPKJyQcffHDax6fWGUjSH/7wh2rvu1GjRiaHtTBIhk2bNpl84YUXmhzW0YU5rMNLXZMqbOvevXtGfWOtu53TJZdcYvJf//pXk7/88ssKn3vTTTeZzBhCdaSrhQnXPLvyyitN/slPfmJy3bp1Tc603grFIax3HzJkiMlvvPFGhc8N6/dqc+1RFI6xY8ea/Itf/MLk8Lj085//3OSwzu64444rux/Wq4fC41Ljxo3Td7YI8MseAAAAACQQkz0AAAAASCAmewAAAACQQAV7wvz27dtNnjp1qsnhui2hQw891ORwXQsg1LRpU5MrW0cvrMObN2+eyayfiMqEa25+/fXXJl911VUmp1uXLKzn69OnTw17B6Q3bdo0kw855JA89QSF5Pbbbze5tLTU5DFjxph82mmnld3fa6+9ctcxFKzVq1ebPGjQIJO/+eYbk8Prdtx3330m77LLLlnsXfHjlz0AAAAASCAmewAAAACQQEz2AAAAACCBCrZm7+KLLzZ50qRJaR8frn3XuXNnky+99FKTBwwYUHY/XH8qrLVq27Zt2n2jOIVrL1
a2jl4odR09iRo91Nzw4cNNHj9+vMl16lT8/Vy4tuj69etN3nXXXWvYOyTRW2+9ZfIpp5xS4WP79+9vcmXr3SKZwvr1hQsXmjxr1iyTt27davIee+xhcseOHbPXORSlRx991OSwRi9c6+755583mRq99PhlDwAAAAASiMkeAAAAACQQkz0AAAAASKCCrdn76quvMnr8Z599ZvLgwYPTPj6shUnVrFkzk8P6waFDh5rcunVrkyur9UJ+LFmyxOSBAweaHNYhhML2l156yeT33nuvyn056KCDTD7qqKNMDuurwrpSJNOHH35Y7ef+4Ac/MLlVq1Ym33HHHWmf37dvX5PbtWtX7b6geIRrooW1nqm2bNli8saNG01u2LChyY0aNaph71CINm3aZPL++++f0fOffPJJkzt16lR2v1u3bqatbt26GfYOxWjt2rVp28877zyTw3WOkR6/7AEAAABAAjHZAwAAAIAEYrIHAAAAAAlUsDV7Xbp0MXnu3LkmH3nkkWmf36BBA5M7dOhg8ieffFLhc2fOnGnyb37zm7T566+/Npk6heJQWW1lZe3jxo0zOazpS31+urby2sN1iO68806TL7/88rR9Q3E68MADTZ4+fXq1t7VmzRqTr7322rSPHzFihMnhcTA8hnKcK05hbfGrr75a5edOnTo1bT7zzDNNnjJlismMmWQI6+jat29v8uLFi9M+/7nnnqswn3rqqabttttuM7lHjx5V7ieS44knnjA5vAZDuM7sYYcdZnLqdRFatmxp2naGulB+2QMAAACABGKyBwAAAAAJxGQPAAAAABLIVba2WKqSkhJfWlqaw+78x7Zt20wO6+LCdciyKaxpCGtdXn/9dZPDNf1uvfVWk3fbbbfsdS4DJSUlKi0tLapF/3I5xsJzvDt27GhypnV1mbRne9uHHnqoyWHdzS677KLawBjLru+++87kOXPmVPm5vXv3NrlOnZp9l7d9+3aTBw0aZPIDDzxQo+1nwjk313tfUms7rKFCGmNvvvmmySeddJLJ4XtrNp111lkmh3U34Zq2+cQYq75wDP3pT38y+ZprrjE5XKcvnXC90HBN2kmTJpncuHHjKm+7tjHGKrZgwQKTjz32WJPDtbczmbuEwmPgqFGjTA7Xjazpe2ltqmiMFc8rAAAAAABUGZM9AAAAAEiggl16IbwUai5P2wx1797d5PDy0nvuuafJDz74oMmHH364yeedd172OodqC5czmDVrlskvvfRSbXbHWLFihcnhsg6hefPmmXzGGWeY/Je//MXk2jqtEzUTLhlzwgknVPm54anvlQlPPx8zZozJ4WmcM2bMMLk2T+NE9d19990mV3baZvjem3rcDE/lHTZsmMmbN282+ZlnnjF5/vz5JoeXS0dxatKkickDBgxIm8PTOFOPJRMnTjRt4ZgKl6MJ9x0+PjymojCFyw6tWrXK5A8++MDkcFmXRx55xOS1a9dWuK/ws163bt1MPv74400Ol1sLl4YrBvyyBwAAAAAJxGQPAAAAABKIyR4AAAAAJFDB1uwVkrDe6eyzzzb5z3/+s8k1uSQscqd+/fomh5f2DXM+jRgxwuShQ4eaHNb0zZ492+SxY8eaHF76GrjnnntMXr16tcnhJc3D5UCQDCUl9irdYS1nv379KnzuP//5T5OffPLJtPuaNm2aydTs7ZzCz1Spy1WFS1dt3LjR5M6dO5sc1ruvW7fO5NatW1e7nygcBx10UNo8cuRIk7ds2WJy6pJp4XFq/PjxJs+cOdPkrl27mhxeE+G0006rqNsFg1/2AAAAACCBmOwBAAAAQAIx2QMAAACABHKZ1JeVlJT40tLSHHanOITrvIQ1fC1atDB5yZIlJtfWmmclJSUqLS0tqkIbxlj5wjqEcE2a5cuXmxzWV23dujUn/WKMJUf4b9KjRw+T999/f5MXLFiQ8z7t4Jyb670vqfyRhaGQxli4rl54LAhrmTN5f9qwYYPJ1113ncmPP/64yQ0bNj
Q5rI3JZw0fY6w4hK/5kksuMTkc33PmzDG5ZcuWuelYFTDGCkM47wnX9AvrmJcuXWpyu3btTA7/jdq0aVPTLlZbRWOMX/YAAAAAIIGY7AEAAABAAjHZAwAAAIAEYp29anjiiSfStp911lkm11aNHpKrefPmJof1U8uWLavN7iC2fv16k3/729+afMMNN5hcr17hHnLD+ikkQ5MmTXK27QkTJpgc1uiFwvdC1tlDpsJ6qrZt25r897//3eSwtvjoo4/OTcdQNMJrGoRrMYY1eIcffrjJixcvNnnixIkmh7XLhYBf9gAAAAAggZjsAQAAAEACMdkDAAAAgAQq3AKSArJt2zaTw3WLQmHty6ZNm0ymhg/ZFp6DHmbkxvbt200eNmyYybvuuqvJV199dc77VF0rV67MdxdQZN566618dwF5EB4r7rrrLpPvvPNOk3O5tt2IESNMfuWVV3K2L+wcwhq+c845x+T777/f5BkzZphMzR4AAAAAoFYw2QMAAACABGKyBwAAAAAJVLA1ex9//LHJ4bpitWn48OEmh+u4hKZPn24yNXrFIaytDOXz7zh69GiTw7oE773Jffv2zXmf8H1hDV+47li/fv1MzmUty3fffWfya6+9ZvL7779v8gMPPGBy+Fr69OmTxd6hUG3dutXkzz77rOz+wIEDTds777xjcqdOnUw+6KCDTL7++uuz0UXk2XHHHWdy+Hkt/Dtn8zi3aNEik0888USTw/GL4vDtt9+a/MILL5h8+umnm1y3bt2c96mqNmzYYHJ4nY9C6Cu/7AEAAABAAjHZAwAAAIAEYrIHAAAAAAlUMDV74XnWPXv2NLm0tNTkjh07Zm3f4bnCixcvNnnkyJFpn9+sWTOTP/nkE5MPPvjgGvQOteWDDz4wed999zU5lzV7Yb3gQw89ZPKtt95qcmXr6IV1psiNOnXs92WtWrUy+b333jP5lFNOMfmiiy4y+cILL0y7v3Ddvs2bN5fdD2v0wuPWgw8+mHbb4WsZNGiQyb/85S/TPh+1Y82aNSYvW7bM5K5du5oc1o+E9b233367yV9++aXJ48aNq7AvDRo0MHnIkCEmX3nllRU+F8Vj+fLlJqfWcZbnzTffNHnPPfdM+/jUY084PufPn2/y5MmTTQ7fOwcMGGDyEUcckXbfKAzh5+6f/vSnJi9cuNDkvffeO+d92mHjxo1p28PPjuHjmzdvnvU+ZYpf9gAAAAAggZjsAQAAAEACMdkDAAAAgAQqmJq9cA2osA5u9913z+r+Us//Pe2000zbv/71r7TPbdeuncnh+elt27atYe+QD+G5/eE6ZK1bt672ttetW2fy+PHjTR48eLDJYU1eWMcQeuSRR0zu1q1bhj1EdYQ1dOFx7JhjjjF53rx5JofrlIU1T6GwFnPGjBll919//XXTFtbgZSpcdw/5EdZLHXvssSb37t3b5HDdsTvuuMPkBQsWVLsv4Xq3M2fONHmPPfao9rZRuMK/a/gZKKzhu+CCCzLafur6jCtXrjRtX331lcmV1Uk//PDDJterVzAfc5HGfvvtZ3L4mShcs/Oxxx4zOfwc36RJk2r3JXyfDtfLDV1xxRUmF0KNXohf9gAAAAAggZjsAQAAAEACMdkDAAAAgAQqmJOZjzrqKJPD87YPPPBAk4cOHWpyZWugPf/88yZPmzat7H64PlUorJ8K16+iRi8Zfvazn5ncp08fk8NzxCuTusbamDFjTFu4llU4xsIc1qyG6/Cdc845GfUNudG5c2eTZ8+ebXK4plnqcagqbrvttmr1qzxhfeAtt9yStW0je959912Tw/WmwhzW71YmrEU++eSTTb733nvL7jdu3Ni0tWjRIqN9IRlmzZplck3XPQ7HcKrwvTCsWZ06darJ9evXr1FfUBjuuusukydMmGDy+eefb3I4B+jbt6/J5557boX7Csffr3/9a5PDOUL79u1NDtdBLkT8sgcAAAAACcRkDwAAAAASiMkeAAAAACRQwdTsNW
zY0OTwPO3FixebPHDgwKztO6wHDNfrCM/1Peyww7K2bxSOyy67zOQpU6aYfPrpp5scrvcTroWXOobTtUnSPvvsY/KZZ55p8s0332xymzZthMLXpUsXk8P6knCdshUrVph80UUXVXlf4ZgZNGhQ2sf37NnT5AYNGlR5X6g9JSUlJof1v9dcc03a5//qV78yuWvXriZ36NDB5AMOOCDTLmInE66zt3r1apOfeeYZk59++mmTU9cHlez6jf379zdtV199tcnhunlNmzatvMMoOo0aNTI5vE7Btddea3J4HYTw81uY0wk/r4VjbuzYsSa3bNmyytvOF37ZAwAAAIAEYrIHAAAAAAlUMKdxhsLLTQ8bNszkP/7xjxltLzwVM/VUlRtvvNG0hZeXxs6hR48eJo8aNcrk3//+9ybPmzcv7fZST9WsbOmE8BTRypYSQTL06tUrbXu/fv1qqScoVOHSCFdddVXaDORaWMIQLsExYMCAtBnIVLi8VLi0wsSJE00OP5+lLnO0fPnyjPb94osvmnz88cdn9PxCwC97AAAAAJBATPYAAAAAIIGY7AEAAABAAhVszd4Pf/hDkydNmpQ2A9kWLsUQZgAAANSucLm2Sy+9NO3jR48encvuFDx+2QMAAACABGKyBwAAAAAJxGQPAAAAABKIyR4AAAAAJBCTPQAAAABIICZ7AAAAAJBATPYAAAAAIIGY7AEAAABAAjHZAwAAAIAEYrIHAAAAAAnEZA8AAAAAEsh576v+YOdWSlqUu+4gyzp471vnuxOZYIwVHcYYakNRjTPGWFFijCHXGGPItXLHWEaTPQAAAABAceA0TgAAAABIICZ7AAAAAJBATPYAAAAAIIGY7AEAAABAAjHZAwAAAIAEYrIHAAAAAAnEZA8AAAAAEojJHgAAAAAkEJM9AAAAAEig/w8zS+S9ZKuzAQAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ] + }, + { + "cell_type": "code", + "source": [ + "model.index_summary()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CQxUTBfPnUOa", + "outputId": "de0f815b-5e2d-436a-f07a-7177b9eed41d" + }, + "execution_count": null, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info]\n", + "------------------ ------------\n", + "distance cosine\n", + "key value store CachedStore\n", + "search algorithm LinearSearch\n", + "evaluator memory\n", + "index size 200\n", + "calibrated False\n", + "calibration_metric f1\n", + "embedding_output\n", + "------------------ ------------\n", + "\n", + "\n", + "\n", + "[Performance]\n", + "----------- -----------\n", + "num lookups 10\n", + "min 0.00716727\n", + "max 0.00716727\n", + "avg 0.00716727\n", + "median 0.00716727\n", + "stddev 0\n", + "----------- -----------\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title save the model and the index\n", + "save_path = \"models/hello_world\" # @param {type:\"string\"}\n", + "model.save(save_path, save_index=True)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GHbK9xObnWPh", + "outputId": "8b72c936-894d-4b80-892b-d386ed6ec2a3" + }, + "execution_count": null, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:absl:Found untraced functions such as _jit_compiled_convolution_op, _jit_compiled_convolution_op, _jit_compiled_convolution_op, _jit_compiled_convolution_op, _update_step_xla while saving (showing 5 of 5). 
These functions will not be directly callable after loading.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title reload the model\n", + "reloaded_model = tf.keras.models.load_model(\n", + " save_path,\n", + " custom_objects={\"SimilarityModel\": tfsim.models.SimilarityModel},\n", + ")\n", + "# reload the index\n", + "reloaded_model.load_index(save_path)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8n51hOGynYXv", + "outputId": "56c238c4-c80e-4c0d-d1e1-e05c15e3f6e3" + }, + "execution_count": null, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Distance metric automatically set to cosine use the distance arg to override.\n", + "Loading index data\n", + "Loading search index\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title check the index is back\n", + "reloaded_model.index_summary()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BHTwTTY5nbJJ", + "outputId": "ffadf50c-f527-47eb-8e9b-65bf3ac3d351" + }, + "execution_count": null, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "[Info]\n", + "------------------ ------------\n", + "distance cosine\n", + "key value store CachedStore\n", + "search algorithm LinearSearch\n", + "evaluator memory\n", + "index size 200\n", + "calibrated False\n", + "calibration_metric f1\n", + "embedding_output\n", + "------------------ ------------\n", + "\n", + "\n", + "\n", + "[Performance]\n", + "----------- -\n", + "num lookups 0\n", + "min 0\n", + "max 0\n", + "avg 0\n", + "median 0\n", + "stddev 0\n", + "----------- -\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "#@title re-run to test on other examples\n", + "num_neighbors = 5\n", + "\n", + "# select\n", + "x_display, y_display = tfsim.samplers.select_examples(x_test, y_test, CLASSES, 1)\n", + "\n", + "# 
lookup the nearest neighbors\n", + "nns = model.lookup(x_display, k=num_neighbors)\n", + "\n", + "# display\n", + "for idx in np.argsort(y_display):\n", + " tfsim.visualization.viz_neigbors_imgs(x_display[idx], y_display[idx], nns[idx], fig_size=(16, 2), cmap=\"Greys\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "f530d120e10445ecb401b690461cc09c", + "862ae5e690c84720aa259ded368d3fef", + "bab9c3c5b3164d23a54f203574bc2bbe", + "0ad5111179e947ebbd0e6086be82596b" + ] + }, + "id": "JpR6WrCinfW4", + "outputId": "c8788f94-f01c-4ffc-a31e-8173243fd105" + }, + "execution_count": null, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f530d120e10445ecb401b690461cc09c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "filtering examples: 0%| | 0/10000 [00:00" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA3sAAACOCAYAAACIehHUAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAAsTAAALEwEAmpwYAAAWUklEQVR4nO3de5CU1ZnH8d+D4CpEkYu4QQRMUMAEBYUEjChVKsaoWS7qmsVbjAbl4robTW0wXpYlZI1uKgniuolBCESiJYOKRXBLwQiaqCBKoLwLKuuFEBQEEcJw9o/3nbHPsaenm+nL9Onvp6rL9+n3vO88zRzf6affc/qYc04AAAAAgLi0qXQCAAAAAIDio9gDAAAAgAhR7AEAAABAhCj2AAAAACBCFHsAAAAAECGKPQAAAACIEMUeAAAAAESopos9M5tkZivNbJeZza50PiiNYv+ezWygma0ys4/T/w7M0ba/mS01s61m9pqZjQ72n2pmL6XnWmZmvTL2zTaz3Wa2PeOxXz7HpvtPM7PnzGyHmW00s/Nb+tqRXRX3sfPN7Kl03+OF5GFm15nZWjP7yMzWm9l1LX3daFqF+1hvM1tsZh+Y2XtmdruZtU33DQ+uUdvNzJnZ2HT/pWZWH+wfke7r2cSx38uSw6x0X5+WvnZkV619LDjPY+m+thnPnWhmz6TXqjVmdlJwzKFmdk96Df3AzH7b0teO7FprH0v372dm08zsnbSvrDazQ7Kcx+tj+VzHzGxy+ndyW/r6TwrPW0o1XexJekfSNEmzKp0ISqpov2cz21/Sg5LmSeokaY6kB9Pnw7Zt07YPS+os6buS5pnZ0en+rpLqJN2Q7l8p6d7gND9xzn0u41Gfz7FmdoykeyRdL6mjpOMkrWrp60eTqrWPbZH0M0n/uQ95mKSL031flzTJzC5owUtHbhXpY6k7JG2S9HlJAyWdImmCJDnnlmdeoySdLWm7pCUZx/8xuI49nh77VnDsAEl7JS0I8j1J0hdb
+rrRrGruYzKzcZLaBc91lrRI0q2SDpH0E0mLzKxTRrM6Se9J6impm6Tb9ulFIx+tso+l/l3SiZKGSTpY0kWSPgl+5mf6WHPXMTP7qpK/secqeT/2a0kLLePD+5JzztX8Q0nHm13pPHi0/t+zpJGS/k+SZTz3lqSvZ2n7ZSV/kDLb/q+k/0i3vyvpqYx9HSTtlNQvjWdLmtZEHs0de0/Dz+FBH2uqn2Q8f7mkx/c1j3TfLyTNqPTvIPZHuftYuu9FSd/IiG+V9D9NtL1b0t0Z8aWSVuSZ102SlgXPtZW0WtKxkpykPpX+HcT+qLY+lj7XUdIrkoam/aRt+vzZktYFbV+R9J2MPDdI2q/S/+619GhtfUxJsbhd0hdz/LysfSxLO+86JukfJT2TEXdIj/98uf69a/3OHlCoL0la49L/Y1Nr0ufzYUreoDec64WGHc65HZJeD841wcy2pMMTMoesNHfsUEkysz+b2btmNi/9hBOtX7n7WIvzMDOTNFzSujxzRGUV2sd+JukCM2tvZodLOlPBXRVJMrMOSj69nhPsGmRmm83sFTO7IXPoVMaxDXeKw2P/RdITzrk1ebwutB7l7mPTJf23kjt0nzksS9xwjRwq6WVJc8zsr2b2rJmd0uSrQmtSzD42QNIeSeemQzxfMbOJwfG5+pikJq9jv5e0n5l9Nb2bd5mk53Odp9go9oDCfE7S1uC5rZIOytL2ZSVDBq4zs3ZmNlLJsIH2eZ7rF5KOUjKs5AZJs83sa3ke20PJEISx6TkOlDQjj9eHyitnHytWHjcr+Xtydx7nReUV2i+eUPIGapukjUqGAz+Qpd0YSZsl/SE49stKrmNjJX1LUrb5nSdJOkzS/Q1PmNkRksZLujHXi0GrVLY+ZmaDJX1N2f/G/VFSdzP7VnqNvETJkOCGa2QPJXeIlkn6e0n/pWQoYNdmXh8qr5h9rIeSO3dHSzpSyQcKN5vZ6VKzfSzTZ65jkj5SMqRzhaRdSu78fTcoUkuKYg/IYGbrMibYDs/SZLuSsdyZDlbyP7PHOfc3SaMknaXkE5zvSbpPyUWm2XM5555zzv3VObfHObdY0m+V/KHLJ4+dSoa5vOKc267kE6lvNPnCUTatqY81I69jzWySkk8yz3LO7crjvCixYvYxM2uj5NPvOiXDj7oqGfJ0S5bzXiLpN5lvYpxzbzjn1jvn9jrn/ixpqpI3UtmOXZBerxr8TNJU51z4hg4V1lr6WHrsHZL+2Tm3J2zsnPurpH+Q9K+S3lcyv/hRfXqN3Clpg3Pu1865vznnfifpbSVv7FFBZe5jO9P/TnXO7UxHEvxO0jea62OBbNex70j6tpJCc39JF0p62My6N3OuoqHYAzI4577kPp1ouzxLk3WSjk1v1Tc4Vk0MX3POrXHOneKc6+KcO0PSFyQ9k3Gu4xrapsNTvtjUuZSM8W74uc0duyZtn3ksWoFW3scKysPMLpP0b5JOdc5tFFqFIvexzkq+uOJ259yu9M3z3Qo+PErvwo2Q9Jvm0lMwrM7MDpR0nj47NO9USbemw6oahjz90cz+qZmfgRJrRX3sYEmDJd2b9pFn0+c3NhQIzrk/OOeGOOc6Kxnx0k+fXiPDv5XKEqMCytzHGoaJZ3vf1Gwfk3JexwZKejj98H2vc26JpHeVfBlMWdR0sWdmbc3sAEn7KRlPe0C2uQSobkX+PT8uqV7S1Wb2d+ldDUla2sTPPjb9ee3N7Fol3wI1O929UNKXzWxsmt+NSsafv5Qee66Zfc7M2qTD8y6U9FA+xyq5iH3bzL5gZu2VvCF/eB9fM5pRxX1sv/T5tpLapOdp+KaxnHlY8q1k0yWd7px7Yx9fK/JUqT7mnNssab2kq9IcDlHy6XU4h+4iJV8G9HqQ95lmdli63U/JkPQHg2NHS/pAyVC6TEcr+bBiYPqQpHOU9GsUWZX2sa2SuuvTPtLw5v0ESU+nr2tQOoTzYCXftPm2c+6RtN1CSZ3M7JL0eniukiF9
T+7j60YOrbWPpX1quaTr03P1l3SBkvdNzfaxVFPXsWclnZW+HzNLhoYeLWntPr7uwpX6G2Ba80PJPBMXPG6udF48WvfvWdIgJcsY7JT0nKRBGfumSPp9Rnyrkv/5tyuZpNsnONdpkl5Kz/W4pN4Z+5YruchsU/IlGxfke2y6/98l/SV9zJXUqdK/i1gfVdzHLs2S9+w881gv6W/pz2143Fnp30Wsjwr3sYFp3/lAyXyp+yQdFpzvJaXfcBg8f5uS4XM7JL2hZBhnu6DNI8rj24PFt3HSx7L0saBNbwXflChpvpK/pVuVLD3TLThmuKQ/p9ewlZKGV/p3EeujNfcxSYcrGeq5Pb1Wjc+3j6XPZ72OKRnJMFXJN4V+pORbQS8q57+7pYkAAAAAACJS08M4AQAAACBWFHsAAAAAECGKPQAAAACIEMUeAAAAAESIYg8AAAAAIlTQ2hZdu3Z1vXv3LlEqKLYNGzZo8+bN1nzL1oM+Vl3oYyiHVatWbXbOHVrpPPJFH6s+9DGUGn0MpdZUHyuo2Ovdu7dWrlxZvKxQUoMHD650CgWjj1UX+hjKwczerHQOhaCPVR/6GEqNPoZSa6qPMYwTAAAAACJEsQcAAAAAEaLYAwAAAIAIUewBAAAAQIQo9gAAAAAgQhR7AAAAABAhij0AAAAAiBDFHgAAAABEiGIPAAAAACJEsQcAAAAAEaLYAwAAAIAIUewBAAAAQIQo9gAAAAAgQm0rnUA1eu6557z4hBNO8OLnn3/ei4877rhSp4TIjR071ovr6uq8eObMmV48YcKEkueE6rJ8+XIvPvnkk72Y6xaKbdSoUV586qmnevHkyZPLmA0A1Cbu7AEAAABAhCj2AAAAACBCDOPMw549e7x46tSpXtymjV8z79ixo+Q5IW4vv/yyF4fDNoFCrV271os7duzoxYceemg500GErrnmGi9etGiRF48bN66M2aAW7N2714vDKQ8rVqzw4vBv6fDhw0uTGKJ11VVXefGdd97pxb169fLiDRs2lDqlZnFnDwAAAAAiRLEHAAAAABGi2AMAAACACDFnLw/z5s3z4nAewpVXXunFw4YNK3lOiNuUKVMKah9+pTlQX1/vxeFclcMOO8yLu3fvXvKcEJe3337bi2fMmOHF7du39+IRI0aUOiXUmFtuucWLH3zwwZztn3rqKS9mzh6a8+6773rxXXfd5cXh93aYWclzKhR39gAAAAAgQhR7AAAAABAhij0AAAAAiBBz9vJw7bXXenGHDh28+KKLLvLi1jheF61boevqzZw504v79u1b9JxQ3TZv3uzFS5cu9eI+ffqUMx1EaOHChV4c/u0bMmSIF7OWI1rKOefFy5Yty9k+nJvMWo8o1MqVK704XNuxGnBnDwAAAAAiRLEHAAAAABGi2AMAAACACDFnL4twHb1t27Z58ZlnnunFQ4cOLXlOiFu/fv0Kaj9hwoQSZYJYzJ07N+f+a665pjyJIFphHwrn7IXz3YGWCue3P/rooznbT5o0yYt79OhR9JwQl4cfftiLzz///JztO3bs6MWLFy8uek4txZ09AAAAAIgQxR4AAAAARIhiDwAAAAAixJw9SRs3bvTicHxufX29F0+ePLnkOSEu4TyDKVOmFHT8mDFjipkOasCLL77oxUcccYQX06fQUuEcPdaYRamNHTs25/6jjjrKi5mbjOaMHz/ei+fNm+fFu3fvznl8uJZj//79i5NYEXFnDwAAAAAiRLEHAAAAABGi2AMAAACACDFnT9IjjzzixeH43HANtMGDB5c8J8QlnKNXV1eXs304n2rBggVFzwlx2bNnjxeH6+xdeeWVXhzOMwCa8+qrr3qxcy5ne/5WoqXWrVvnxa+99poXt2vXzouXLVvmxR06dChNYojGli1bvPiTTz7J2f6QQw7x4vnz5xc7paLjzh4AAAAARIhiDwAAAAAiRLEHAAAAABGq2Tl7u3btaty+8cYbc7YNx4CH43WBbDLX1mtujl5o
+vTpxU4HkZs1a5YXh+uD9ujRo5zpIELhnL1wXb3jjz/ei7t161bynBCXDz/80IsHDBiQs/0ll1zixd27dy92SojM+vXrvXjJkiUFHT979mwvHjhwYAszKj3u7AEAAABAhCj2AAAAACBCFHsAAAAAEKGanbN3//33N26/99573r6hQ4d6cefOncuSE+ISrs+YS7iuXt++fYudDiKUuc5ZuNZP165dvfjyyy8vS06Iy9atWxu3wz4UrrNX6NxkQPLXNSt0bcYf/vCHxU4HkVuzZo0Xf/zxxznbX3rppV58+umnFzulkuPOHgAAAABEiGIPAAAAACJEsQcAAAAAEarZOXv33Xdfk/tGjRrlxW3b1uw/EwqQua5eoRYsWFDETFAr3nrrrcbtJ554wtt3ww03eDFzj7Evtm3b1ri9adMmb1+4zh6wL1avXt24/cYbb+RsO2nSJC/u2bNnSXJCXJYuXdq4Ha7NGArnjc6cOdOLDzjggOIlVibc2QMAAACACFHsAQAAAECEamZ84rp167x4yZIljdvdunXz9vEV5dgXLVlqAdgXuYb/DhgwoIyZIFZPPvlk43a41MKRRx7pxV26dClLTqhu4XJXI0eOzPvYadOmeXG7du2KkhPi8uqrr3px5nuujz76KOexQ4YM8eJqHLYZ4s4eAAAAAESIYg8AAAAAIkSxBwAAAAARqpk5ew899JAX79mzp3H7sssu8/Z16tSpLDmhut1xxx15tw3n6LHUAvZFfX29F3//+99v3B43bpy3b/To0WXJCXFbu3Zt43a41MI555zjxe3bty9LTqhus2fP9uIdO3Y0bofzQm+//XYvPvjgg0uWF+Lx4x//2ItzzdO78MILvfjWW28tSU6VxJ09AAAAAIgQxR4AAAAARIhiDwAAAAAiFO2cvd27d3txOGcv0/vvv1/qdBChxx57LO+206dPL2EmqBVz5szx4sz5LT/60Y+8fW3a8FkeWi7z2hXO2fvBD35Q7nRQhVavXu3F119/fZNtR4wY4cVXXHFFKVJCldu5c6cXh9eihQsX5n2u8NgDDzxw3xNrpXg3AAAAAAARotgDAAAAgAhR7AEAAABAhKKdszdjxgwvfuaZZ7x45MiRTbYFsgnX1aurq8v72L59+xY7HdSA119/3YvHjx/vxd/85jcbtw8//PCy5IS4bdq0yYsz5+mFc/a6detWlpxQ3bZv3+7F4Vp6mWvn/epXv/L27b///qVLDFUrnMs5f/78nO07duzYuD1r1ixvX8+ePYuXWCvFnT0AAAAAiBDFHgAAAABEiGIPAAAAACIUzZy9+vp6L25uPtXZZ5/duB3jmhoovokTJxbU/qWXXipRJqgV4VpB4VyXzLX1WFcPxbBy5UovDvsc0Jzw/djUqVNztu/SpUvjdp8+fUqSE6pb+H4q19rZ2Zx11lmN26NGjSpGSlWFdwcAAAAAECGKPQAAAACIEMUeAAAAAEQomjl7W7Zs8eI//elPFcoEsQjX1WvOmDFjvJi19VCocF7CzTff7MWjR4/24mOOOabUKaHGvPDCC16cubbe5ZdfXu50UIUWLVrkxY899ljO9nPnzi1lOojAtGnTvHjHjh0523fo0MGLw3X5ag139gAAAAAgQhR7AAAAABAhij0AAAAAiFA0c/beeeedgtoPGjSoRJkgFs3NMwgtWLCgRJmgVtx7771eHK4Betddd5UzHdSArVu3evGMGTO8eO/evY3b48aNK0tOqG7NfWfCiSee6MVf+cpXSpkOIrBixYqC2j/wwANefPLJJxcxm+rDnT0AAAAAiBDFHgAAAABEiGIPAAAAACIUzZy9ZcuW5dx/0kknefGQIUNKmQ4iUFdXl3P/zJkzy5QJYnX//fd78fTp07141KhRXtyxY8dSp4QaM2fOHC/etGmTF59wwgmN28OGDStLTqguH3zwgRf/8pe/zNn++OOP9+K2baN5K4oiefPNN704nFscOuWUU7yYa5WPO3sAAAAAECGK
PQAAAACIEMUeAAAAAEQomoHSP//5z3PuHzBggBe3a9eulOmgBkycODHn/gkTJpQpE1SrxYsXe7FzzotvuummcqaDGhTO0Qv74EEHHdS4zd9NZBOu//nhhx/mbH/bbbeVMBvEoFevXl4czlfftm2bF/fv39+LwzVqax139gAAAAAgQhR7AAAAABAhij0AAAAAiFA0c/buueceL7766qu9OFzXBWipcJ095uihpa677jovPuaYYyqUCWqVmeWMgVB9fX3O/UcddZQX7927t5TpIELnnXeeF//0pz+tUCbViTt7AAAAABAhij0AAAAAiFA0wziHDRvmxc8++2yFMkEswq8gB4pt1qxZlU4BNW7atGk5Y6A5V1xxhRevWbPGi1etWuXFGzZs8OJ+/fqVJC/E47TTTvPip59+2osvvvjicqZTdbizBwAAAAARotgDAAAAgAhR7AEAAABAhKKZswcAAIDy6tKlixeHS2EBLXXGGWfkjJEbd/YAAAAAIEIUewAAAAAQIYo9AAAAAIgQxR4AAAAARIhiDwAAAAAiRLEHAAAAABGi2AMAAACACFHsAQAAAECEKPYAAAAAIEIUewAAAAAQIYo9AAAAAIiQOefyb2z2F0lvli4dFFkv59yhlU6iEPSxqkMfQzlUVT+jj1Ul+hhKjT6GUsvaxwoq9gAAAAAA1YFhnAAAAAAQIYo9AAAAAIgQxR4AAAAARIhiDwAAAAAiRLEHAAAAABGi2AMAAACACFHsAQAAAECEKPYAAAAAIEIUewAAAAAQof8HO5hg4WltOfMAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + ] +} \ No newline at end of file From 8719b0c6b11744419db98f8e203eb571851edbd3 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 13 Mar 2023 14:38:26 -0700 Subject: [PATCH 30/35] applying fixes to PR review comments --- tensorflow_similarity/base_indexer.py | 14 ++++++++++---- tensorflow_similarity/indexer.py | 11 +++-------- tensorflow_similarity/stores/cached_store.py | 11 +++++++---- tensorflow_similarity/stores/memory_store.py | 3 --- tensorflow_similarity/stores/redis_store.py | 2 +- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index bdaa0d46..e0e111d1 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -23,14 +23,20 @@ class BaseIndexer(ABC): - def __init__(self, distance, embedding_output, embedding_size, evaluator, stat_buffer_size): + def __init__( + self, + distance: Union[Distance, str], + embedding_output: int, + embedding_size: int, + evaluator: Union[Evaluator, str], + stat_buffer_size: int, + ) -> None: distance = distance_canonicalizer(distance) self.distance = distance # needed for save()/load() self.embedding_output = embedding_output self.embedding_size = embedding_size # internal structure naming - # FIXME support custom objects self.evaluator_type = evaluator # code used to evaluate indexer performance @@ -157,11 +163,11 @@ def evaluate_classification( query_labels = tf.convert_to_tensor(np.array(target_labels)) # TODO(ovallis): The float type should be derived from the model. 
- lookup_distances = unpack_lookup_distances(lookups, dtype="float32") + lookup_distances = unpack_lookup_distances(lookups, dtype=tf.keras.backend.floatx()) lookup_labels = unpack_lookup_labels(lookups, dtype=query_labels.dtype) thresholds: FloatTensor = tf.cast( tf.convert_to_tensor(distance_thresholds), - dtype=lookup_distances.dtype, + dtype=tf.keras.backend.floatx(), ) results: dict[str, np.ndarray] = self.evaluator.evaluate_classification( diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 569c5e26..f6c4d8ea 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -102,7 +102,7 @@ def __init__( super().__init__(distance, embedding_output, embedding_size, evaluator, stat_buffer_size) # internal structure naming # FIXME support custom objects - self.search_type = search if isinstance(search, str) else type(search).__name__ + self.search_type = search if isinstance(search, str) else type(search).name if isinstance(search, Search): self.search: Search = search self.kv_store_type = kv_store if isinstance(kv_store, str) else type(kv_store).__name__ @@ -122,7 +122,7 @@ def _init_structures(self) -> None: self.search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) elif self.search_type == "linear": self.search = LinearSearch(distance=self.distance, dim=self.embedding_size) - elif not isinstance(self.search, Search): + elif not hasattr(self, "search") or not isinstance(self.search, Search): # self.search should have been already initialized raise ValueError("You need to either supply a known search " "framework name or a Search() object") @@ -131,15 +131,10 @@ def _init_structures(self) -> None: self.kv_store = MemoryStore() elif isinstance(self.kv_store_type, Store): self.kv_store = self.kv_store_type - elif not isinstance(self.kv_store, Store): + elif not hasattr(self, "search") or not isinstance(self.kv_store, Store): # self.kv_store should have been already initialized raise 
ValueError("You need to either supply a know key value " "store name or a Store() object") - if not self.search: - raise ValueError("search not initialized") - if not self.kv_store: - raise ValueError("kv_store not initialized") - # stats self._stats: DefaultDict[str, int] = defaultdict(int) self._lookup_timings_buffer: Deque[float] = deque([], maxlen=self.stat_buffer_size) diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index 2afcdf80..02c987cb 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -31,7 +31,7 @@ class CachedStore(Store): """Efficient cached dataset store""" - def __init__(self, shard_size=1000000, path=".", num_items=0, **kw_args) -> None: + def __init__(self, shard_size: int = 1000000, path: str = ".", num_items: int = 0, **kw_args) -> None: # We are using a native python cached dictionary # db[id] = pickle((embedding, label, data)) self.db: list[dict[str, str]] = [] @@ -53,6 +53,9 @@ def __reopen_all_shards(self): for shard_no in range(len(self.db)): self.db[shard_no] = self.__make_new_shard(shard_no) + def __get_shard_no(self, idx: int) -> int: + return idx // self.shard_size + def add( self, embedding: FloatTensor, @@ -72,7 +75,7 @@ def add( Associated record id. 
""" idx = self.num_items - shard_no = idx // self.shard_size + shard_no = self.__get_shard_no(idx) if len(self.db) <= shard_no: self.__add_new_shard() self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, data)) @@ -105,7 +108,7 @@ def batch_add( idx = i + self.num_items label = None if labels is None else labels[i] rec_data = None if data is None else data[i] - shard_no = idx // self.shard_size + shard_no = self.__get_shard_no(idx) if len(self.db) <= shard_no: self.__add_new_shard() self.db[shard_no][str(idx)] = pickle.dumps((embedding, label, rec_data)) @@ -124,7 +127,7 @@ def get(self, idx: int) -> tuple[FloatTensor, int | None, Tensor | None]: record associated with the requested id. """ - shard_no = idx // self.shard_size + shard_no = self.__get_shard_no(idx) embedding, label, data = pickle.loads(self.db[shard_no][str(idx)]) return embedding, label, data diff --git a/tensorflow_similarity/stores/memory_store.py b/tensorflow_similarity/stores/memory_store.py index 6792cf4b..fbdc42c9 100644 --- a/tensorflow_similarity/stores/memory_store.py +++ b/tensorflow_similarity/stores/memory_store.py @@ -207,6 +207,3 @@ def to_data_frame(self, num_records: int = 0) -> PandasDataFrame: # forcing type from Any to PandasFrame df: PandasDataFrame = pd.DataFrame.from_dict(data) return df - - def get_config(self): - return super().get_config() diff --git a/tensorflow_similarity/stores/redis_store.py b/tensorflow_similarity/stores/redis_store.py index 2cad7610..4fd91418 100644 --- a/tensorflow_similarity/stores/redis_store.py +++ b/tensorflow_similarity/stores/redis_store.py @@ -29,7 +29,7 @@ class RedisStore(Store): """Efficient Redis dataset store""" - def __init__(self, host="localhost", port=6379, db=0, **kw_args) -> None: + def __init__(self, host: str = "localhost", port: int = 6379, db: int = 0, **kw_args) -> None: # Currently does not support authentication self.host = host self.port = port From 7eab295be4cada66a88e1bfd44f7fbdde198f06a Mon Sep 17 00:00:00 2001 
From: Ali Zand Date: Mon, 13 Mar 2023 15:22:31 -0700 Subject: [PATCH 31/35] typo --- tensorflow_similarity/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index f6c4d8ea..17dee58a 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -102,7 +102,7 @@ def __init__( super().__init__(distance, embedding_output, embedding_size, evaluator, stat_buffer_size) # internal structure naming # FIXME support custom objects - self.search_type = search if isinstance(search, str) else type(search).name + self.search_type = search if isinstance(search, str) else search.name if isinstance(search, Search): self.search: Search = search self.kv_store_type = kv_store if isinstance(kv_store, str) else type(kv_store).__name__ From ca3a803c4204551b1f7c3870aae09c975d2e0acd Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 13 Mar 2023 20:01:27 -0700 Subject: [PATCH 32/35] fix the tests for no normalization --- tensorflow_similarity/base_indexer.py | 1 + tensorflow_similarity/search/faiss_search.py | 32 +++++----- tensorflow_similarity/search/linear_search.py | 60 +++++++------------ tests/search/test_linear_search.py | 28 ++++----- 4 files changed, 51 insertions(+), 70 deletions(-) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index e0e111d1..a92711c1 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -3,6 +3,7 @@ from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import Mapping, MutableMapping, Sequence +from typing import Union import numpy as np import tensorflow as tf diff --git a/tensorflow_similarity/search/faiss_search.py b/tensorflow_similarity/search/faiss_search.py index f1241dd8..1b714076 100644 --- a/tensorflow_similarity/search/faiss_search.py +++ b/tensorflow_similarity/search/faiss_search.py @@ -96,27 +96,25 @@ def 
__init__( if distance == "cosine": # this is exact match using cosine/dot-product Distance self.index = faiss.IndexFlatIP(dim) - else: + elif distance == "l2": # this is exact match using L2 distance self.index = faiss.IndexFlatL2(dim) + else: + raise ValueError(f"distance {distance} not supported") def is_built(self): - return self.built - - def needs_building(self): - if self.algo == "flat": - return False - else: - return not self.index.is_trained + return self.algo == "flat" or self.index.is_trained - def build_index(self, samples, **kwargss): + def build_index(self, samples, normalize=True, **kwargss): if self.algo == "ivfpq": - if self.normalize: + if normalize: faiss.normalize_L2(samples) self.index.train(samples) # we must train the index to cluster into cells self.built = True - def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[int]], list[list[float]]]: + def batch_lookup( + self, embeddings: FloatTensor, k: int = 5, normalize: bool = True + ) -> tuple[list[list[int]], list[list[float]]]: """Find embeddings K nearest neighboors embeddings. Args: @@ -124,12 +122,12 @@ def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[i k: Number of nearest neighboors embedding to lookup. Defaults to 5. """ - if self.normalize: + if normalize: faiss.normalize_L2(embeddings) sims, indices = self.index.search(embeddings, k) return indices, sims - def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[float]]: + def lookup(self, embedding: FloatTensor, k: int = 5, normalize: bool = True) -> tuple[list[int], list[float]]: """Find embedding K nearest neighboors embeddings. Args: @@ -137,12 +135,12 @@ def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[fl k: Number of nearest neighboors embedding to lookup. Defaults to 5. 
""" int_embedding = np.array([embedding], dtype=np.float32) - if self.normalize: + if normalize: faiss.normalize_L2(int_embedding) sims, indices = self.index.search(int_embedding, k) return indices[0], sims[0] - def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): + def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, normalize: bool = True, **kwargs): """Add a single embedding to the search index. Args: @@ -151,7 +149,7 @@ def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): allow to lookup the data associated with a given embedding. """ int_embedding = np.array([embedding], dtype=np.float32) - if self.normalize: + if normalize: faiss.normalize_L2(int_embedding) if self.algo != "flat": self.index.add_with_ids(int_embedding) @@ -175,7 +173,7 @@ def batch_add( embeddings. verbose: Be verbose. Defaults to 1. """ - if self.normalize: + if normalize: faiss.normalize_L2(embeddings) if self.algo != "flat": # flat does not accept indexes as parameters and assumes incremental diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index 75e9323d..bc316fac 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -58,7 +58,9 @@ def is_built(self): def needs_building(self): return False - def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[int]], list[list[float]]]: + def batch_lookup( + self, embeddings: FloatTensor, k: int = 5, normalize: bool = True + ) -> tuple[list[list[int]], list[list[float]]]: """Find embeddings K nearest neighboors embeddings. 
Args: @@ -67,39 +69,17 @@ def batch_lookup(self, embeddings: FloatTensor, k: int = 5) -> tuple[list[list[i """ items = len(self.ids) - if self.distance.name == "cosine": - normalized_query = tf.math.l2_normalize(embeddings, axis=1) - sims = tf.matmul(normalized_query, tf.transpose(self.db[:items])) - similarity, id_idxs = tf.math.top_k(sims, k) - ids_array = np.array(self.ids) - return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(similarity) - elif self.distance.name in ("euclidean", "squared_euclidean"): - normalized_query = tf.math.l2_normalize(embeddings, axis=1) - items = len(self.ids) - assert ( - normalized_query.shape.as_list()[-1] == self.db.shape[-1] - ), "the last dimension should have the same size" - query_norms = tf.reduce_sum(tf.square(normalized_query), axis=1) - query_norms = tf.reshape(query_norms, [-1, 1]) # Only one column per row - - db_norms = tf.reduce_sum(tf.square(self.db[:items]), axis=1) - db_norms = tf.reshape(db_norms, [-1, 1]) # Only one column per row - - dists = query_norms - 2 * tf.matmul(normalized_query, tf.transpose(self.db[:items])) + db_norms - dists, id_idxs = tf.math.top_k(-dists, k) - dists = -dists - ids_array = np.array(self.ids) - return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(dists) - elif self.distance.name == "manhattan": - dists = tf.reduce_sum(tf.abs(tf.subtract(self.db[:items], tf.expand_dims(embeddings, 1))), axis=2) - dists, id_idxs = tf.math.top_k(-dists, k) - dists = -dists - ids_array = np.array(self.ids) - return list(np.array([ids_array[x.numpy()] for x in id_idxs])), list(dists) + if normalize: + query = tf.math.l2_normalize(embeddings, axis=1) else: - raise ValueError("Unsupported metric space") - - def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[float]]: + query = embeddings + sims = self.distance(query, self.db[:items]) + similarity, id_idxs = tf.math.top_k(sims, k) + id_idxs = id_idxs.numpy() + ids_array = np.array(self.ids) + return 
list(np.array([ids_array[x] for x in id_idxs])), list(similarity) + + def lookup(self, embedding: FloatTensor, k: int = 5, normalize: bool = True) -> tuple[list[int], list[float]]: """Find embedding K nearest neighboors embeddings. Args: @@ -107,10 +87,10 @@ def lookup(self, embedding: FloatTensor, k: int = 5) -> tuple[list[int], list[fl k: Number of nearest neighboors embedding to lookup. Defaults to 5. """ embeddings: FloatTensor = tf.convert_to_tensor([embedding], dtype=np.float32) - idxs, dists = self.batch_lookup(embeddings, k=k) + idxs, dists = self.batch_lookup(embeddings, k=k, normalize=normalize) return idxs[0], dists[0] - def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): + def add(self, embedding: FloatTensor, idx: int, normalize: bool = True, verbose: int = 1, **kwargs): """Add a single embedding to the search index. Args: @@ -118,7 +98,8 @@ def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): idx: Embedding id as in the index table. Returned with the embedding to allow to lookup the data associated with a given embedding. """ - int_embedding = tf.math.l2_normalize(np.array([embedding], dtype=np.float32), axis=1) + if normalize: + embedding = tf.math.l2_normalize(np.array([embedding], dtype=tf.keras.backend.floatx()), axis=1) items = len(self.ids) if items + 1 > self.db.shape[0]: # it's full @@ -126,7 +107,7 @@ def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, **kwargs): new_db[:items] = self.db self.db = new_db self.ids.append(idx) - self.db[items] = int_embedding + self.db[items] = embedding def batch_add( self, @@ -145,7 +126,8 @@ def batch_add( embeddings. verbose: Be verbose. Defaults to 1. 
""" - int_embeddings = tf.math.l2_normalize(embeddings, axis=1) + if normalize: + embeddings = tf.math.l2_normalize(embeddings, axis=1) items = len(self.ids) if items + len(embeddings) > self.db.shape[0]: # it's full @@ -156,7 +138,7 @@ def batch_add( new_db[:items] = self.db self.db = new_db self.ids.extend(idxs) - self.db[items : items + len(embeddings)] = int_embeddings + self.db[items : items + len(embeddings)] = embeddings def __make_file_path(self, path): return Path(path) / "index.pickle" diff --git a/tests/search/test_linear_search.py b/tests/search/test_linear_search.py index 0a86a0b1..5e091764 100644 --- a/tests/search/test_linear_search.py +++ b/tests/search/test_linear_search.py @@ -8,10 +8,10 @@ def test_index_match(): embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") search_index = LinearSearch("cosine", 3) - search_index.add(embs[0], 0) - search_index.add(embs[1], 1) + search_index.add(embs[0], 0, normalize=False) + search_index.add(embs[1], 1, normalize=False) - idxs, embs = search_index.lookup(target, k=2) + idxs, embs = search_index.lookup(target, k=2, normalize=False) assert len(embs) == 2 assert list(idxs) == [0, 1] @@ -36,13 +36,13 @@ def test_index_match_l2(): embs = np.array([[1, 1, 3], [3, 1, 2]], dtype="float32") search_index = LinearSearch("l2", 3) - search_index.add(embs[0], 0) - search_index.add(embs[1], 1) + search_index.add(embs[0], 0, normalize=False) + search_index.add(embs[1], 1, normalize=False) - idxs, embs = search_index.lookup(target, k=2) + idxs, embs = search_index.lookup(target, k=2, normalize=False) assert len(embs) == 2 - assert list(idxs) == [0, 1] + assert list(idxs) == [1, 0] def test_index_save(tmp_path): @@ -51,10 +51,10 @@ def test_index_save(tmp_path): k = 2 search_index = LinearSearch("cosine", 3) - search_index.add(embs[0], 0) - search_index.add(embs[1], 1) + search_index.add(embs[0], 0, normalize=False) + search_index.add(embs[1], 1, normalize=False) - idxs, embs = search_index.lookup(target, k=k) + idxs, 
embs = search_index.lookup(target, k=k, normalize=False) assert len(embs) == k assert list(idxs) == [0, 1] @@ -64,16 +64,16 @@ def test_index_save(tmp_path): search_index2 = LinearSearch("cosine", 3) search_index2.load(tmp_path) - idxs2, embs2 = search_index.lookup(target, k=k) + idxs2, embs2 = search_index.lookup(target, k=k, normalize=False) assert len(embs2) == k assert list(idxs2) == [0, 1] # add more # if the dtype is not passed we get an incompatible type error - search_index2.add(np.array([3.0, 3.0, 3.0], dtype="float32"), 3) - idxs3, embs3 = search_index2.lookup(target, k=3) + search_index2.add(np.array([3.0, 3.0, 3.0], dtype="float32"), 3, normalize=False) + idxs3, embs3 = search_index2.lookup(target, k=3, normalize=False) assert len(embs3) == 3 - assert list(idxs3) == [0, 3, 1] + assert list(idxs3) == [0, 1, 3] def test_batch_vs_single(tmp_path): From 7a5321c4ae006d65fd4ee35d50c6ba5eeb5cb601 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 13 Mar 2023 20:17:00 -0700 Subject: [PATCH 33/35] add distance --- tensorflow_similarity/base_indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index a92711c1..b300377b 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -15,7 +15,7 @@ F1Score, make_classification_metric, ) -from .distances import distance_canonicalizer +from .distances import Distance, distance_canonicalizer from .evaluators import Evaluator, MemoryEvaluator from .matchers import ClassificationMatch, make_classification_matcher from .retrieval_metrics import RetrievalMetric From f7c1d4c402dc3efe8b48a370df242d3bd0c6167b Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 13 Mar 2023 22:36:59 -0700 Subject: [PATCH 34/35] fix typing --- tensorflow_similarity/base_indexer.py | 6 ++--- tensorflow_similarity/indexer.py | 2 +- tensorflow_similarity/search/linear_search.py | 26 +++++-------------- 
tensorflow_similarity/stores/cached_store.py | 2 +- 4 files changed, 11 insertions(+), 25 deletions(-) diff --git a/tensorflow_similarity/base_indexer.py b/tensorflow_similarity/base_indexer.py index b300377b..57b31603 100644 --- a/tensorflow_similarity/base_indexer.py +++ b/tensorflow_similarity/base_indexer.py @@ -3,7 +3,7 @@ from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import Mapping, MutableMapping, Sequence -from typing import Union +from typing import Optional, Union import numpy as np import tensorflow as tf @@ -27,7 +27,7 @@ class BaseIndexer(ABC): def __init__( self, distance: Union[Distance, str], - embedding_output: int, + embedding_output: Optional[int], embedding_size: int, evaluator: Union[Evaluator, str], stat_buffer_size: int, @@ -44,7 +44,7 @@ def __init__( if self.evaluator_type == "memory": self.evaluator: Evaluator = MemoryEvaluator() elif isinstance(self.evaluator_type, Evaluator): - self.evaluator: Evaluator = self.evaluator_type + self.evaluator = self.evaluator_type else: raise ValueError("You need to either supply a know evaluator name " "or an Evaluator() object") diff --git a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index 17dee58a..d24db73c 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -66,7 +66,7 @@ def __init__( search: Union[Search, str] = "nmslib", kv_store: Union[Store, str] = "memory", evaluator: Union[Evaluator, str] = "memory", - embedding_output: int = None, + embedding_output: Optional[int] = None, stat_buffer_size: int = 1000, ) -> None: """Index embeddings to make them searchable via KNN diff --git a/tensorflow_similarity/search/linear_search.py b/tensorflow_similarity/search/linear_search.py index bc316fac..0754ff07 100644 --- a/tensorflow_similarity/search/linear_search.py +++ b/tensorflow_similarity/search/linear_search.py @@ -49,7 +49,7 @@ def __init__(self, distance: Distance | str, dim: int, verbose: int = 
0, name: s f"| - name: {self.name}", ] cprint("\n".join(t_msg) + "\n", "green") - self.db = np.empty((INITIAL_DB_SIZE, dim), dtype=np.float32) + self.db: List[FloatTensor] = [] self.ids: List[int] = [] def is_built(self): @@ -73,7 +73,8 @@ def batch_lookup( query = tf.math.l2_normalize(embeddings, axis=1) else: query = embeddings - sims = self.distance(query, self.db[:items]) + db_tensor = tf.convert_to_tensor(self.db) + sims = self.distance(query, db_tensor) similarity, id_idxs = tf.math.top_k(sims, k) id_idxs = id_idxs.numpy() ids_array = np.array(self.ids) @@ -90,7 +91,7 @@ def lookup(self, embedding: FloatTensor, k: int = 5, normalize: bool = True) -> idxs, dists = self.batch_lookup(embeddings, k=k, normalize=normalize) return idxs[0], dists[0] - def add(self, embedding: FloatTensor, idx: int, normalize: bool = True, verbose: int = 1, **kwargs): + def add(self, embedding: FloatTensor, idx: int, verbose: int = 1, normalize: bool = True, **kwargs): """Add a single embedding to the search index. 
Args: @@ -100,14 +101,8 @@ def add(self, embedding: FloatTensor, idx: int, normalize: bool = True, verbose: """ if normalize: embedding = tf.math.l2_normalize(np.array([embedding], dtype=tf.keras.backend.floatx()), axis=1) - items = len(self.ids) - if items + 1 > self.db.shape[0]: - # it's full - new_db = np.empty((len(self.ids) + DB_SIZE_STEPS, self.dim), dtype=np.float32) - new_db[:items] = self.db - self.db = new_db self.ids.append(idx) - self.db[items] = embedding + self.db.append(embedding) def batch_add( self, @@ -128,17 +123,8 @@ def batch_add( """ if normalize: embeddings = tf.math.l2_normalize(embeddings, axis=1) - items = len(self.ids) - if items + len(embeddings) > self.db.shape[0]: - # it's full - new_db = np.empty( - (((items + len(embeddings) + DB_SIZE_STEPS) // DB_SIZE_STEPS) * DB_SIZE_STEPS, self.dim), - dtype=np.float32, - ) - new_db[:items] = self.db - self.db = new_db self.ids.extend(idxs) - self.db[items : items + len(embeddings)] = embeddings + self.db.extend(embeddings) def __make_file_path(self, path): return Path(path) / "index.pickle" diff --git a/tensorflow_similarity/stores/cached_store.py b/tensorflow_similarity/stores/cached_store.py index 02c987cb..a4cb016d 100644 --- a/tensorflow_similarity/stores/cached_store.py +++ b/tensorflow_similarity/stores/cached_store.py @@ -34,7 +34,7 @@ class CachedStore(Store): def __init__(self, shard_size: int = 1000000, path: str = ".", num_items: int = 0, **kw_args) -> None: # We are using a native python cached dictionary # db[id] = pickle((embedding, label, data)) - self.db: list[dict[str, str]] = [] + self.db: list[dict[str, bytes]] = [] self.shard_size = shard_size self.num_items: int = num_items self.path: str = path From caa206aa9b54841f2a863cd77e44884560cd5809 Mon Sep 17 00:00:00 2001 From: Ali Zand Date: Mon, 20 Mar 2023 21:17:41 -0700 Subject: [PATCH 35/35] remove double definition --- tensorflow_similarity/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tensorflow_similarity/indexer.py b/tensorflow_similarity/indexer.py index fd4f2523..d64ba8ea 100644 --- a/tensorflow_similarity/indexer.py +++ b/tensorflow_similarity/indexer.py @@ -119,7 +119,7 @@ def _init_structures(self) -> None: "(re)initialize internal storage structure" if self.search_type == "nmslib": - self.search: Search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) + self.search = NMSLibSearch(distance=self.distance, dim=self.embedding_size) elif self.search_type == "linear": self.search = LinearSearch(distance=self.distance, dim=self.embedding_size) elif isinstance(self.search_type, Search):