diff --git a/docs/examples/openai_qna.ipynb b/docs/examples/openai_qna.ipynb
index d0bafd7b..d3c1bf91 100644
--- a/docs/examples/openai_qna.ipynb
+++ b/docs/examples/openai_qna.ipynb
@@ -710,13 +710,11 @@
    "source": [
     "import os\n",
     "from redisvl.vectorize.text import OpenAITextVectorizer\n",
-    "from redisvl.utils.utils import array_to_buffer\n",
     "\n",
     "api_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n",
     "oaip = OpenAITextVectorizer(EMBEDDINGS_MODEL, api_config={\"api_key\": api_key})\n",
     "\n",
-    "chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist())\n",
-    "chunked_data[\"embedding\"] = chunked_data[\"embedding\"].apply(lambda x: array_to_buffer(x))\n",
+    "chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist(), as_buffer=True)\n",
     "chunked_data"
    ]
   },
diff --git a/docs/user_guide/vectorizers_03.ipynb b/docs/user_guide/vectorizers_03.ipynb
index bbf61886..d25cf8bc 100644
--- a/docs/user_guide/vectorizers_03.ipynb
+++ b/docs/user_guide/vectorizers_03.ipynb
@@ -105,7 +105,7 @@
    " \"Today is a sunny day\"\n",
     "]\n",
     "\n",
-    "embeddings = hf.embed_many(sentences)\n"
+    "embeddings = hf.embed_many(sentences, as_buffer=True)\n"
    ]
   },
   {
@@ -183,7 +183,7 @@
    "# the vector is stored as a bytes buffer\n",
     "\n",
     "data = [{\"text\": t,\n",
-    "         \"embedding\": array_to_buffer(v)}\n",
+    "         \"embedding\": v}\n",
     "        for t, v in zip(sentences, embeddings)]\n",
     "\n",
     "index.load(data)"
diff --git a/redisvl/vectorize/base.py b/redisvl/vectorize/base.py
index 64f32568..592f7938 100644
--- a/redisvl/vectorize/base.py
+++ b/redisvl/vectorize/base.py
@@ -1,5 +1,7 @@
 from typing import Callable, Dict, List, Optional
 
+from redisvl.utils.utils import array_to_buffer
+
 
 class BaseVectorizer:
     def __init__(self, model: str, dims: int, api_config: Optional[Dict] = None):
@@ -21,27 +23,35 @@ def set_model(self, model: str, dims: Optional[int] = None) -> None:
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 1000,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         raise NotImplementedError
 
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         raise NotImplementedError
 
     async def aembed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 1000,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         raise NotImplementedError
 
     async def aembed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         raise NotImplementedError
 
@@ -51,3 +61,8 @@ def batchify(self, seq: list, size: int, preprocess: Optional[Callable] = None):
             yield [preprocess(chunk) for chunk in seq[pos : pos + size]]
         else:
             yield seq[pos : pos + size]
+
+    def _process_embedding(self, embedding: List[float], as_buffer: bool):
+        if as_buffer:
+            return array_to_buffer(embedding)
+        return embedding
diff --git a/redisvl/vectorize/text/huggingface.py b/redisvl/vectorize/text/huggingface.py
index b46bfe0a..b45349fa 100644
--- a/redisvl/vectorize/text/huggingface.py
+++ b/redisvl/vectorize/text/huggingface.py
@@ -4,6 +4,7 @@
 
 
 class HFTextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         # TODO set dims based on model
         dims = 768
@@ -18,21 +19,57 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
         self._model_client = SentenceTransformer(model)
 
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
+        """Embed a chunk of text using the Hugging Face sentence transformer.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        embedding = self._model_client.encode([emb_input])[0]
-        return embedding.tolist()
+            text = preprocess(text)
+        embedding = self._model_client.encode([text])[0]
+        return self._process_embedding(embedding.tolist(), as_buffer)
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
-        embeddings = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Embed many chunks of text using the Hugging Face sentence
+        transformer.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 1000.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             batch_embeddings = self._model_client.encode(batch)
-            embeddings.extend([embedding.tolist() for embedding in batch_embeddings])
+            embeddings.extend(
+                [
+                    self._process_embedding(embedding.tolist(), as_buffer)
+                    for embedding in batch_embeddings
+                ]
+            )
         return embeddings
diff --git a/redisvl/vectorize/text/openai.py b/redisvl/vectorize/text/openai.py
index ac24a05b..f8a63b58 100644
--- a/redisvl/vectorize/text/openai.py
+++ b/redisvl/vectorize/text/openai.py
@@ -1,9 +1,16 @@
 from typing import Callable, Dict, List, Optional
 
+from tenacity import (  # for exponential backoff
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+)
+
 from redisvl.vectorize.base import BaseVectorizer
 
 
 class OpenAITextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         dims = 1536
         super().__init__(model, dims, api_config)
@@ -18,42 +25,112 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
             openai.api_key = api_config.get("api_key", None)
         self._model_client = openai.Embedding
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 10,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
-        results = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Embed many chunks of text using the OpenAI API.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 10.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             response = self._model_client.create(input=batch, engine=self._model)
-            results += [r["embedding"] for r in response["data"]]
-        return results
+            embeddings += [
+                self._process_embedding(r["embedding"], as_buffer)
+                for r in response["data"]
+            ]
+        return embeddings
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
+        """Embed a chunk of text using the OpenAI API.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        result = self._model_client.create(input=[emb_input], engine=self._model)
-        return result["data"][0]["embedding"]
+            text = preprocess(text)
+        result = self._model_client.create(input=[text], engine=self._model)
+        return self._process_embedding(result["data"][0]["embedding"], as_buffer)
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     async def aembed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
-        results = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Asynchronously embed many chunks of text using the OpenAI API.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 1000.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             response = await self._model_client.acreate(input=batch, engine=self._model)
-            results += [r["embedding"] for r in response["data"]]
-        return results
+            embeddings += [
+                self._process_embedding(r["embedding"], as_buffer)
+                for r in response["data"]
+            ]
+        return embeddings
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     async def aembed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
+        """Asynchronously embed a chunk of text using the OpenAI API.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        result = await self._model_client.acreate(input=[emb_input], engine=self._model)
-        return result["data"][0]["embedding"]
+            text = preprocess(text)
+        result = await self._model_client.acreate(input=[text], engine=self._model)
+        return self._process_embedding(result["data"][0]["embedding"], as_buffer)
diff --git a/requirements.txt b/requirements.txt
index c74fdbd7..722f3596 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ numpy
 redis>=4.3.4
 pyyaml
 coloredlogs
-pydantic>=2.0.0
\ No newline at end of file
+pydantic>=2.0.0
+tenacity==8.2.2
\ No newline at end of file
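
Taken together, these changes let every vectorizer hand back Redis-ready byte strings directly, so callers no longer convert with `array_to_buffer` themselves; that is exactly the boilerplate the two notebook hunks delete. The snippet below is a minimal usage sketch, not part of the diff: it assumes the module layout shown above and that `sentence-transformers/all-mpnet-base-v2` (a 768-dimensional model, matching the hard-coded `dims = 768`) is available locally.

# Illustrative sketch only (not part of this diff). Assumes the package
# layout above and a locally cached 768-dim sentence-transformers model.
from redisvl.vectorize.text.huggingface import HFTextVectorizer

hf = HFTextVectorizer("sentence-transformers/all-mpnet-base-v2")

# Default behavior is unchanged: a plain list of floats.
vector = hf.embed("Today is a sunny day")

# as_buffer=True routes the result through _process_embedding(), which
# calls array_to_buffer() and returns a byte string that can be written
# straight into a Redis hash field.
buffer = hf.embed("Today is a sunny day", as_buffer=True)

# Batch form, mirroring the updated vectorizers_03.ipynb cell: buffered
# embeddings go into index.load() with no per-row conversion.
sentences = ["That is a happy dog", "Today is a sunny day"]
embeddings = hf.embed_many(sentences, batch_size=2, as_buffer=True)
data = [{"text": t, "embedding": v} for t, v in zip(sentences, embeddings)]

The OpenAI methods additionally gain `tenacity` retries: each call backs off with random exponential waits between 1 and 60 seconds and gives up after 6 attempts, so transient rate-limit errors are retried rather than surfaced immediately, hence the new `tenacity==8.2.2` pin in requirements.txt.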