Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/activeloopai/deeplake into …
Browse files Browse the repository at this point in the history
…mmsegmentation
  • Loading branch information
activesoull committed Jun 17, 2024
2 parents 55282a8 + 8b27518 commit f497d9d
Show file tree
Hide file tree
Showing 9 changed files with 964 additions and 36 deletions.
21 changes: 11 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Deep Lake is a Database for AI powered by a storage format optimized for deep-le
1. Storing data and vectors while building LLM applications
2. Managing datasets while training deep learning models

Deep Lake simplifies the deployment of enterprise-grade LLM-based products by offering storage for all data types (embeddings, audio, text, videos, images, pdfs, annotations, etc.), querying and vector search, data streaming while training models at scale, data versioning and lineage, and integrations with popular tools such as LangChain, LlamaIndex, Weights & Biases, and many more. Deep Lake works with data of any size, it is serverless, and it enables you to store all of your data in your own cloud and in one place. Deep Lake is used by Intel, Bayer Radiology, Matterport, ZERO Systems, Red Cross, Yale, & Oxford.
Deep Lake simplifies the deployment of enterprise-grade LLM-based products by offering storage for all data types (embeddings, audio, text, videos, images, dicom, pdfs, annotations, [and more](https://docs.deeplake.ai/en/latest/Htypes.html)), querying and vector search, data streaming while training models at scale, data versioning and lineage, and integrations with popular tools such as LangChain, LlamaIndex, Weights & Biases, and many more. Deep Lake works with data of any size, it is serverless, and it enables you to store all of your data in your own cloud and in one place. Deep Lake is used by Intel, Bayer Radiology, Matterport, ZERO Systems, Red Cross, Yale, & Oxford.

### Deep Lake includes the following features:

Expand Down Expand Up @@ -95,19 +95,20 @@ pip3 install deeplake

### Vector Store Applications
Using Deep Lake as a Vector Store for building LLM applications:
### - [Vector Store Quickstart](https://docs.activeloop.ai/quickstart)
### - [Vector Store Getting Started Guide](https://docs.activeloop.ai/getting-started/vector-store)
### - [Using Deep Lake with LangChain](https://docs.activeloop.ai/tutorials/vector-store/deep-lake-vector-store-in-langchain)
### - [Image Similarity Search with Deep Lake](https://docs.activeloop.ai/tutorials/vector-store/image-similarity-search)
### - [Vector Store Quickstart](https://docs.activeloop.ai/examples/rag/quickstart)
### - [Vector Store Tutorials](https://docs.activeloop.ai/examples/rag/tutorials)
### - [LangChain Integration](https://docs.activeloop.ai/examples/rag/langchain-integration)
### - [LlamaIndex Integration](https://docs.activeloop.ai/examples/rag/llamaindex-integration)
### - [Image Similarity Search with Deep Lake](https://docs.activeloop.ai/examples/rag/tutorials/image-similarity-search)


### Deep Learning Applications
Using Deep Lake for managing data while training Deep Learning models:
### - [Deep Learning Quickstart](https://docs.activeloop.ai/quickstart-dl)
### - [Deep Learning Getting Started Guide](https://docs.activeloop.ai/getting-started/deep-learning)
### - [Tutorials for Training Models](https://docs.activeloop.ai/tutorials/deep-learning/training-models)
### - [Tutorials for Creating Deep Learning Datasets](https://docs.activeloop.ai/tutorials/deep-learning/creating-datasets)
### - [Deep Learning Playbooks](https://docs.activeloop.ai/playbooks/evaluating-model-performance)
### - [Deep Learning Quickstart](https://docs.activeloop.ai/examples/dl/quickstart)
### - [Deep Learning Getting Started Guide](https://docs.activeloop.ai/examples/dl/guide)
### - [Tutorials for Training Models](https://docs.activeloop.ai/examples/dl/tutorials/training-models)
### - [Tutorials for Creating Deep Learning Datasets](https://docs.activeloop.ai/examples/dl/tutorials/creating-datasets)
### - [Deep Learning Playbooks](https://docs.activeloop.ai/examples/dl/playbooks)

## ⚙️ Integrations

Expand Down
1 change: 1 addition & 0 deletions deeplake/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@
"additional_params": {
"efConstruction": 600,
"M": 32,
"partitions": 1,
},
}
VECTORSTORE_EXTEND_BATCH_SIZE = 500
Expand Down
36 changes: 24 additions & 12 deletions deeplake/core/dataset/indra_tensor_view.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,33 @@ def numpy(

def text(self, fetch_chunks: bool = False):
    """Return text data. Only applicable for tensors with 'text' base htype.

    Args:
        fetch_chunks (bool): Kept for API compatibility; not used by this
            implementation.

    Returns:
        str for a single sample (``ndim == 1``), otherwise a list of str.
    """
    # Removed the unreachable legacy body that returned before the
    # try/except fallback could ever run (diff-merge leftover).
    try:
        bs = self.indra_tensor.bytes()
        if self.ndim == 1:
            return bs.decode()
        if isinstance(bs, bytes):
            # A single flat byte string still represents one sample.
            return [bs.decode()]
        return [b.decode() for b in bs]
    except Exception:
        # Raw bytes unavailable — fall back to numpy-decoded samples.
        bs = self.indra_tensor.numpy(aslist=True)
        if self.ndim == 1:
            return bs[0]
        return [b[0] for b in bs]

def dict(self, fetch_chunks: bool = False):
    """Return json data. Only applicable for tensors with 'json' base htype.

    Args:
        fetch_chunks (bool): Kept for API compatibility; not used by this
            implementation.

    Returns:
        The decoded JSON object for a single sample (``ndim == 1``),
        otherwise a list of decoded JSON objects.
    """
    # Removed the unreachable legacy body that returned before the
    # try/except fallback could ever run (diff-merge leftover).
    try:
        bs = self.indra_tensor.bytes()
        if self.ndim == 1:
            return json.loads(bs.decode())
        if isinstance(bs, bytes):
            # A single flat byte string still represents one sample.
            return [json.loads(bs.decode())]
        # Reuse the already-fetched bytes instead of calling bytes() again.
        return [json.loads(b.decode()) for b in bs]
    except Exception:
        # Raw bytes unavailable — fall back to numpy-decoded samples.
        bs = self.indra_tensor.numpy(aslist=True)
        if self.ndim == 1:
            return bs[0]
        return [b[0] for b in bs]

@property
def dtype(self):
Expand Down
47 changes: 40 additions & 7 deletions deeplake/core/index_maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,21 @@ def index_exists(dataset):
return False


def index_partition_count(dataset):
    """Return the partition count recorded on the dataset's first vdb index.

    Falls back to 1 whenever no embedding tensor, no vdb index, or no
    ``additional_params`` entry is available.
    """
    emb_tensor = fetch_embedding_tensor(dataset)
    if emb_tensor is None:
        return 1
    vdb_indexes = emb_tensor.fetch_vdb_indexes()
    if not vdb_indexes:
        return 1
    additional_params = vdb_indexes[0].get("additional_params", {})
    if additional_params is None:
        return 1
    return additional_params.get("partitions", 1)


def index_used(exec_option):
    """Check if the index is used for the exec_option"""
    index_backed_options = {"tensor_db", "compute_engine"}
    return exec_option in index_backed_options
Expand Down Expand Up @@ -101,8 +116,17 @@ def check_index_params(self):

existing_distance = existing_params.get("distance", "COS")
if curr_distance == existing_distance:
current_additional_params_dict = current_params.get("additional_params", None)
existing_additional_params_dict = existing_params.get("additional_params", None)
current_additional_params_dict = current_params.get(
"additional_params", {}
).copy()
existing_additional_params_dict = existing_params.get(
"additional_params", {}
).copy()

# Remove the 'partitions' key from the copies of the dictionaries
current_additional_params_dict.pop("partitions", None)
existing_additional_params_dict.pop("partitions", None)

if current_additional_params_dict == existing_additional_params_dict:
return True

Expand Down Expand Up @@ -134,9 +158,9 @@ def get_index_metric(metric):


def normalize_additional_params(params: dict) -> dict:
mapping = {"efconstruction": "efConstruction", "m": "M"}
mapping = {"efconstruction": "efConstruction", "m": "M", "partitions": "partitions"}

allowed_keys = ["efConstruction", "m"]
allowed_keys = ["efConstruction", "m", "partitions"]

# New dictionary to store the result with desired key format
result_dict = {}
Expand Down Expand Up @@ -180,7 +204,9 @@ def check_vdb_indexes(dataset):
return False


def _incr_maintenance_vdb_indexes(tensor, indexes, index_operation):
def _incr_maintenance_vdb_indexes(
tensor, indexes, index_operation, is_partitioned=False
):
try:
is_embedding = tensor.htype == "embedding"
has_vdb_indexes = hasattr(tensor.meta, "vdb_indexes")
Expand All @@ -194,6 +220,7 @@ def _incr_maintenance_vdb_indexes(tensor, indexes, index_operation):
tensor.update_vdb_index(
operation_kind=index_operation,
row_ids=indexes,
is_partitioned=is_partitioned,
)
except Exception as e:
raise Exception(f"An error occurred while regenerating VDB indexes: {e}")
Expand Down Expand Up @@ -247,7 +274,6 @@ def index_operation_dataset(self, dml_type, rowids):

if index_operation_type == INDEX_OP_TYPE.NOOP:
return

if (
index_operation_type == INDEX_OP_TYPE.CREATE_INDEX
or index_operation_type == INDEX_OP_TYPE.REGENERATE_INDEX
Expand All @@ -272,6 +298,13 @@ def index_operation_dataset(self, dml_type, rowids):
else:
emb_tensor.create_vdb_index("hnsw_1", distance=distance)
elif index_operation_type == INDEX_OP_TYPE.INCREMENTAL_INDEX:
_incr_maintenance_vdb_indexes(emb_tensor, rowids, dml_type)
partition_count = index_partition_count(self)
print(f"Partition count: {partition_count}")
if partition_count > 1:
_incr_maintenance_vdb_indexes(
emb_tensor, rowids, dml_type, is_partitioned=True
)
else:
_incr_maintenance_vdb_indexes(emb_tensor, rowids, dml_type)
else:
raise Exception("Unknown index operation")
12 changes: 12 additions & 0 deletions deeplake/core/meta/tensor_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,18 @@ def get_vdb_index_ids(self):
index_ids.append(index["id"])
return index_ids

def update_vdb_partition(self, id: str, partition: int):
    """Record the partition count on the vdb index named ``id``.

    Raises:
        ValueError: If no vdb index with that name exists, or if the
            index's ``additional_params`` is not a dictionary.
    """
    if not self.contains_vdb_index(id):
        raise ValueError(f"Tensor meta has no vdb index with name '{id}'.")
    for index in self.vdb_indexes:
        if index["id"] != id:
            continue
        additional_param = index["additional_params"]
        if not isinstance(additional_param, dict):
            raise ValueError("additional_params must be a dictionary")
        additional_param["partitions"] = partition
        # Mark the meta as changed so it gets persisted.
        self.is_dirty = True
        return

def add_vdb_index(self, id: str, type: str, distance: str, **kwargs):
if self.contains_vdb_index(id):
raise ValueError(f"Tensor meta already has a vdb index with name '{id}'.")
Expand Down
Loading

0 comments on commit f497d9d

Please sign in to comment.