Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/activeloopai/deeplake into …
Browse files Browse the repository at this point in the history
…mmsegmentation
  • Loading branch information
activesoull committed Jun 17, 2024
2 parents 55282a8 + 8b27518 commit f497d9d
Show file tree
Hide file tree
Showing 9 changed files with 964 additions and 36 deletions.
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Deep Lake is a Database for AI powered by a storage format optimized for deep-le
1. Storing data and vectors while building LLM applications
2. Managing datasets while training deep learning models

Deep Lake simplifies the deployment of enterprise-grade LLM-based products by offering storage for all data types (embeddings, audio, text, videos, images, pdfs, annotations, etc.), querying and vector search, data streaming while training models at scale, data versioning and lineage, and integrations with popular tools such as LangChain, LlamaIndex, Weights & Biases, and many more. Deep Lake works with data of any size, it is serverless, and it enables you to store all of your data in your own cloud and in one place. Deep Lake is used by Intel, Bayer Radiology, Matterport, ZERO Systems, Red Cross, Yale, & Oxford.
Deep Lake simplifies the deployment of enterprise-grade LLM-based products by offering storage for all data types (embeddings, audio, text, videos, images, dicom, pdfs, annotations, [and more](https://docs.deeplake.ai/en/latest/Htypes.html)), querying and vector search, data streaming while training models at scale, data versioning and lineage, and integrations with popular tools such as LangChain, LlamaIndex, Weights & Biases, and many more. Deep Lake works with data of any size, it is serverless, and it enables you to store all of your data in your own cloud and in one place. Deep Lake is used by Intel, Bayer Radiology, Matterport, ZERO Systems, Red Cross, Yale, & Oxford.

### Deep Lake includes the following features:

Expand Down Expand Up @@ -95,19 +95,20 @@ pip3 install deeplake

### Vector Store Applications
Using Deep Lake as a Vector Store for building LLM applications:
### - [Vector Store Quickstart](https://docs.activeloop.ai/quickstart)
### - [Vector Store Getting Started Guide](https://docs.activeloop.ai/getting-started/vector-store)
### - [Using Deep Lake with LangChain](https://docs.activeloop.ai/tutorials/vector-store/deep-lake-vector-store-in-langchain)
### - [Image Similarity Search with Deep Lake](https://docs.activeloop.ai/tutorials/vector-store/image-similarity-search)
### - [Vector Store Quickstart](https://docs.activeloop.ai/examples/rag/quickstart)
### - [Vector Store Tutorials](https://docs.activeloop.ai/examples/rag/tutorials)
### - [LangChain Integration](https://docs.activeloop.ai/examples/rag/langchain-integration)
### - [LlamaIndex Integration](https://docs.activeloop.ai/examples/rag/llamaindex-integration)
### - [Image Similarity Search with Deep Lake](https://docs.activeloop.ai/examples/rag/tutorials/image-similarity-search)


### Deep Learning Applications
Using Deep Lake for managing data while training Deep Learning models:
### - [Deep Learning Quickstart](https://docs.activeloop.ai/quickstart-dl)
### - [Deep Learning Getting Started Guide](https://docs.activeloop.ai/getting-started/deep-learning)
### - [Tutorials for Training Models](https://docs.activeloop.ai/tutorials/deep-learning/training-models)
### - [Tutorials for Creating Deep Learning Datasets](https://docs.activeloop.ai/tutorials/deep-learning/creating-datasets)
### - [Deep Learning Playbooks](https://docs.activeloop.ai/playbooks/evaluating-model-performance)
### - [Deep Learning Quickstart](https://docs.activeloop.ai/examples/dl/quickstart)
### - [Deep Learning Getting Started Guide](https://docs.activeloop.ai/examples/dl/guide)
### - [Tutorials for Training Models](https://docs.activeloop.ai/examples/dl/tutorials/training-models)
### - [Tutorials for Creating Deep Learning Datasets](https://docs.activeloop.ai/examples/dl/tutorials/creating-datasets)
### - [Deep Learning Playbooks](https://docs.activeloop.ai/examples/dl/playbooks)

## ⚙️ Integrations

Expand Down
1 change: 1 addition & 0 deletions deeplake/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@
"additional_params": {
"efConstruction": 600,
"M": 32,
"partitions": 1,
},
}
VECTORSTORE_EXTEND_BATCH_SIZE = 500
Expand Down
36 changes: 24 additions & 12 deletions deeplake/core/dataset/indra_tensor_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,33 @@ def numpy(

def text(self, fetch_chunks: bool = False):
    """Return text data. Only applicable for tensors with 'text' base htype.

    Args:
        fetch_chunks (bool): Kept for API compatibility; not used by this
            implementation.

    Returns:
        str for a single sample (``ndim == 1``), otherwise a list of str.
    """
    # Removed the unreachable legacy body that returned before the
    # try/except fallback could ever run (diff-merge leftover).
    try:
        bs = self.indra_tensor.bytes()
        if self.ndim == 1:
            return bs.decode()
        if isinstance(bs, bytes):
            # A single flat byte string still represents one sample.
            return [bs.decode()]
        return [b.decode() for b in bs]
    except Exception:
        # Raw bytes unavailable — fall back to numpy-decoded samples.
        bs = self.indra_tensor.numpy(aslist=True)
        if self.ndim == 1:
            return bs[0]
        return [b[0] for b in bs]

def dict(self, fetch_chunks: bool = False):
    """Return json data. Only applicable for tensors with 'json' base htype.

    Args:
        fetch_chunks (bool): Kept for API compatibility; not used by this
            implementation.

    Returns:
        The decoded JSON object for a single sample (``ndim == 1``),
        otherwise a list of decoded JSON objects.
    """
    # Removed the unreachable legacy body that returned before the
    # try/except fallback could ever run (diff-merge leftover).
    try:
        bs = self.indra_tensor.bytes()
        if self.ndim == 1:
            return json.loads(bs.decode())
        if isinstance(bs, bytes):
            # A single flat byte string still represents one sample.
            return [json.loads(bs.decode())]
        # Reuse the already-fetched bytes instead of calling bytes() again.
        return [json.loads(b.decode()) for b in bs]
    except Exception:
        # Raw bytes unavailable — fall back to numpy-decoded samples.
        bs = self.indra_tensor.numpy(aslist=True)
        if self.ndim == 1:
            return bs[0]
        return [b[0] for b in bs]

@property
def dtype(self):
Expand Down
47 changes: 40 additions & 7 deletions deeplake/core/index_maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,21 @@ def index_exists(dataset):
return False


def index_partition_count(dataset):
    """Return the partition count recorded on the dataset's first vdb index.

    Falls back to 1 whenever no embedding tensor, no vdb index, or no
    ``additional_params`` entry is available.
    """
    emb_tensor = fetch_embedding_tensor(dataset)
    if emb_tensor is None:
        return 1
    vdb_indexes = emb_tensor.fetch_vdb_indexes()
    if not vdb_indexes:
        return 1
    additional_params = vdb_indexes[0].get("additional_params", {})
    if additional_params is None:
        return 1
    return additional_params.get("partitions", 1)


def index_used(exec_option):
    """Check if the index is used for the exec_option"""
    index_backed_options = {"tensor_db", "compute_engine"}
    return exec_option in index_backed_options
Expand Down Expand Up @@ -101,8 +116,17 @@ def check_index_params(self):

existing_distance = existing_params.get("distance", "COS")
if curr_distance == existing_distance:
current_additional_params_dict = current_params.get("additional_params", None)
existing_additional_params_dict = existing_params.get("additional_params", None)
current_additional_params_dict = current_params.get(
"additional_params", {}
).copy()
existing_additional_params_dict = existing_params.get(
"additional_params", {}
).copy()

# Remove the 'partitions' key from the copies of the dictionaries
current_additional_params_dict.pop("partitions", None)
existing_additional_params_dict.pop("partitions", None)

if current_additional_params_dict == existing_additional_params_dict:
return True

Expand Down Expand Up @@ -134,9 +158,9 @@ def get_index_metric(metric):


def normalize_additional_params(params: dict) -> dict:
mapping = {"efconstruction": "efConstruction", "m": "M"}
mapping = {"efconstruction": "efConstruction", "m": "M", "partitions": "partitions"}

allowed_keys = ["efConstruction", "m"]
allowed_keys = ["efConstruction", "m", "partitions"]

# New dictionary to store the result with desired key format
result_dict = {}
Expand Down Expand Up @@ -180,7 +204,9 @@ def check_vdb_indexes(dataset):
return False


def _incr_maintenance_vdb_indexes(tensor, indexes, index_operation):
def _incr_maintenance_vdb_indexes(
tensor, indexes, index_operation, is_partitioned=False
):
try:
is_embedding = tensor.htype == "embedding"
has_vdb_indexes = hasattr(tensor.meta, "vdb_indexes")
Expand All @@ -194,6 +220,7 @@ def _incr_maintenance_vdb_indexes(tensor, indexes, index_operation):
tensor.update_vdb_index(
operation_kind=index_operation,
row_ids=indexes,
is_partitioned=is_partitioned,
)
except Exception as e:
raise Exception(f"An error occurred while regenerating VDB indexes: {e}")
Expand Down Expand Up @@ -247,7 +274,6 @@ def index_operation_dataset(self, dml_type, rowids):

if index_operation_type == INDEX_OP_TYPE.NOOP:
return

if (
index_operation_type == INDEX_OP_TYPE.CREATE_INDEX
or index_operation_type == INDEX_OP_TYPE.REGENERATE_INDEX
Expand All @@ -272,6 +298,13 @@ def index_operation_dataset(self, dml_type, rowids):
else:
emb_tensor.create_vdb_index("hnsw_1", distance=distance)
elif index_operation_type == INDEX_OP_TYPE.INCREMENTAL_INDEX:
_incr_maintenance_vdb_indexes(emb_tensor, rowids, dml_type)
partition_count = index_partition_count(self)
print(f"Partition count: {partition_count}")
if partition_count > 1:
_incr_maintenance_vdb_indexes(
emb_tensor, rowids, dml_type, is_partitioned=True
)
else:
_incr_maintenance_vdb_indexes(emb_tensor, rowids, dml_type)
else:
raise Exception("Unknown index operation")
12 changes: 12 additions & 0 deletions deeplake/core/meta/tensor_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,18 @@ def get_vdb_index_ids(self):
index_ids.append(index["id"])
return index_ids

def update_vdb_partition(self, id: str, partition: int):
    """Record the partition count on the vdb index named ``id``.

    Raises:
        ValueError: If no vdb index with that name exists, or if the
            index's ``additional_params`` is not a dictionary.
    """
    if not self.contains_vdb_index(id):
        raise ValueError(f"Tensor meta has no vdb index with name '{id}'.")
    for index in self.vdb_indexes:
        if index["id"] != id:
            continue
        additional_param = index["additional_params"]
        if not isinstance(additional_param, dict):
            raise ValueError("additional_params must be a dictionary")
        additional_param["partitions"] = partition
        # Mark the meta as changed so it gets persisted.
        self.is_dirty = True
        return

def add_vdb_index(self, id: str, type: str, distance: str, **kwargs):
if self.contains_vdb_index(id):
raise ValueError(f"Tensor meta already has a vdb index with name '{id}'.")
Expand Down
Loading

0 comments on commit f497d9d

Please sign in to comment.