Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ability to clear ALL data associated with an index #179

Merged
merged 7 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions docs/user_guide/getting_started_01.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -653,13 +653,44 @@
"## Cleanup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below we will clean up after our work. First, you can optionally flush all data from Redis associated with the index by\n",
"using the `.clear()` method. This will leave the secondary index in place for future insertions or updates.\n",
"\n",
"But if you want to clean up everything, including the index, just use `.delete()`\n",
"which will by default remove the index AND the underlying data."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# (optionally) clear all data from Redis associated with the index\n",
"await index.clear()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# but the index is still in place\n",
"await index.exists()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# clean up the index\n",
"# remove / delete the index in its entirety\n",
"await index.delete()"
]
}
Expand All @@ -680,7 +711,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.10.14"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
12 changes: 6 additions & 6 deletions docs/user_guide/vectorizers_04.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -531,14 +531,14 @@
}
],
"source": [
"from redisvl.utils.vectorize import MistralAITextVectorizer\n",
"# from redisvl.utils.vectorize import MistralAITextVectorizer\n",
"\n",
"mistral = MistralAITextVectorizer()\n",
"# mistral = MistralAITextVectorizer()\n",
"\n",
"# mebed a sentence using their asyncronous method\n",
"test = await mistral.aembed(\"This is a test sentence.\")\n",
"print(\"Vector dimensions: \", len(test))\n",
"print(test[:10])"
"# # embed a sentence using their asynchronous method\n",
"# test = await mistral.aembed(\"This is a test sentence.\")\n",
"# print(\"Vector dimensions: \", len(test))\n",
"# print(test[:10])"
]
},
{
Expand Down
5 changes: 1 addition & 4 deletions redisvl/extensions/llmcache/semantic.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,10 +176,7 @@ def set_vectorizer(self, vectorizer: BaseVectorizer) -> None:

def clear(self) -> None:
"""Clear the cache of all keys while preserving the index."""
with self._index.client.pipeline(transaction=False) as pipe: # type: ignore
for key in self._index.client.scan_iter(match=f"{self._index.prefix}:*"): # type: ignore
pipe.delete(key)
pipe.execute()
self._index.clear()

def delete(self) -> None:
"""Clear the semantic cache of all keys and remove the underlying search
Expand Down
5 changes: 1 addition & 4 deletions redisvl/extensions/session_manager/semantic_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,7 @@ def set_scope(

def clear(self) -> None:
"""Clears the chat session history."""
with self._index.client.pipeline(transaction=False) as pipe: # type: ignore
for key in self._index.client.scan_iter(match=f"{self._index.prefix}:*"): # type: ignore
pipe.delete(key)
pipe.execute()
self._index.clear()

def delete(self) -> None:
"""Clear all conversation keys and remove the search index."""
Expand Down
43 changes: 42 additions & 1 deletion redisvl/index/index.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should let them control the batching parameter but defaulted to 500?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if someone's goal is to just clear all of the data from an index, we should implement the best practice under the hood and not make the user worry about it? But it certainly wouldn't be hard to add an optional arg in the future if folks need it!

Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
from redis.commands.search.indexDefinition import IndexDefinition

from redisvl.index.storage import HashStorage, JsonStorage
from redisvl.query.query import BaseQuery, CountQuery, FilterQuery
from redisvl.query import BaseQuery, CountQuery, FilterQuery
from redisvl.query.filter import FilterExpression
from redisvl.redis.connection import (
RedisConnectionFactory,
convert_index_info_to_schema,
Expand Down Expand Up @@ -476,6 +477,26 @@ def delete(self, drop: bool = True):
except:
logger.exception("Error while deleting index")

def clear(self) -> int:
"""Clear all keys in Redis associated with the index, leaving the index
available and in-place for future insertions or updates.

Returns:
int: Count of records deleted from Redis.
"""
# Track deleted records
total_records_deleted: int = 0

# Paginate using queries and delete in batches
for batch in self.paginate(
FilterQuery(FilterExpression("*"), return_fields=["id"]), page_size=500
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unlikely scenario, but what if I want to get the (some) data out before destroying the index, e.g. return_fields being configurable also?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with the notion that someone might want to be able to export data from an index. However, I think that should be its own clear feature. For example, an index.export(file_path="file.json") method or something similar?

I think clear() should just be a simple method for clearing data out of your index

):
batch_keys = [record["id"] for record in batch]
self._redis_client.delete(*batch_keys) # type: ignore
total_records_deleted += len(batch_keys)

return total_records_deleted

def load(
self,
data: Iterable[Any],
Expand Down Expand Up @@ -894,6 +915,26 @@ async def delete(self, drop: bool = True):
logger.exception("Error while deleting index")
raise

async def clear(self) -> int:
    """Clear all keys in Redis associated with the index, leaving the index
    available and in-place for future insertions or updates.

    Pages through the index with a wildcard query and deletes the matched
    keys in batches of 500, so the full key set is never held in memory
    at once.

    Returns:
        int: Count of records deleted from Redis.
    """
    # Track deleted records
    total_records_deleted: int = 0

    # Paginate using queries and delete in batches
    async for batch in self.paginate(
        FilterQuery(FilterExpression("*"), return_fields=["id"]), page_size=500
    ):
        batch_keys = [record["id"] for record in batch]
        # Guard against an empty page: DEL with zero keys is a Redis
        # protocol error, so only issue the delete when keys exist.
        if batch_keys:
            await self._redis_client.delete(*batch_keys)  # type: ignore
            total_records_deleted += len(batch_keys)

    return total_records_deleted

async def load(
self,
data: Iterable[Any],
Expand Down
10 changes: 8 additions & 2 deletions redisvl/query/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
from redisvl.query.query import CountQuery, FilterQuery, RangeQuery, VectorQuery
from redisvl.query.query import (
BaseQuery,
CountQuery,
FilterQuery,
RangeQuery,
VectorQuery,
)

__all__ = ["VectorQuery", "FilterQuery", "RangeQuery", "CountQuery"]
__all__ = ["BaseQuery", "VectorQuery", "FilterQuery", "RangeQuery", "CountQuery"]
2 changes: 1 addition & 1 deletion schemas/schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ index:
fields:
- name: user
type: tag
path: '.user'
path: '$.user'
- name: credit_score
type: tag
path: '$.credit_score'
Expand Down
7 changes: 7 additions & 0 deletions tests/integration/test_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,11 @@ def hash_preprocess(item: dict) -> dict:
for field in return_fields:
assert getattr(doc1, field) == doc2[field]

count_deleted_keys = index.clear()
assert count_deleted_keys == len(sample_data)

assert index.exists() == True

index.delete()

assert index.exists() == False
7 changes: 7 additions & 0 deletions tests/integration/test_flow_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,4 +93,11 @@ async def hash_preprocess(item: dict) -> dict:
for field in return_fields:
assert getattr(doc1, field) == doc2[field]

count_deleted_keys = await index.clear()
assert count_deleted_keys == len(sample_data)

assert await index.exists() == True

await index.delete()

assert await index.exists() == False
8 changes: 6 additions & 2 deletions tests/integration/test_vectorizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def skip_vectorizer() -> bool:
VertexAITextVectorizer,
CohereTextVectorizer,
AzureOpenAITextVectorizer,
MistralAITextVectorizer,
# MistralAITextVectorizer,
CustomTextVectorizer,
]
)
Expand Down Expand Up @@ -218,7 +218,11 @@ def bad_return_type(text: str) -> str:


@pytest.fixture(
params=[OpenAITextVectorizer, MistralAITextVectorizer, CustomTextVectorizer]
params=[
OpenAITextVectorizer,
# MistralAITextVectorizer,
CustomTextVectorizer,
]
)
def avectorizer(request, skip_vectorizer):
if skip_vectorizer:
Expand Down
Loading