lancedb · westonpace · Aug 19, 2024 · Jun 19, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/python/python/benchmarks/test_index.py b/python/python/benchmarks/test_index.py
@@ -187,11 +187,13 @@ def test_transform_vectors_with_precomputed_parts(
 
 @pytest.mark.benchmark(group="shuffle_vectors")
 def test_shuffle_vectors(test_large_dataset, tmpdir, benchmark):
-    ivf = rand_ivf(test_dataset)
-    pq = rand_pq(test_dataset, ivf)
-    builder = IndicesBuilder(test_dataset, "vector")
+    ivf = rand_ivf(test_large_dataset)
+    pq = rand_pq(test_large_dataset, ivf)
+    builder = IndicesBuilder(test_large_dataset, "vector")
     transformed_uri = str(tmpdir / "output.lance")
-    builder.transform_vectors(ivf, pq, transformed_uri)
+    part_ids_path = str(tmpdir / "part_ids")
+    gen_rand_part_ids(test_large_dataset, part_ids_path)
+    builder.transform_vectors(ivf, pq, transformed_uri, None, part_ids_path)
     shuffle_out = str(tmpdir)
     benchmark.pedantic(
         builder.shuffle_transformed_vectors,

diff --git a/python/src/indices.rs b/python/src/indices.rs
@@ -6,7 +6,7 @@ use arrow_array::{Array, FixedSizeListArray};
 use arrow_data::ArrayData;
 use lance::index::vector::ivf::builder::write_vector_storage;
 use lance::io::ObjectStore;
-use lance_index::vector::ivf::shuffler::{load_partitioned_shuffles, shuffle_vectors};
+use lance_index::vector::ivf::shuffler::{shuffle_vectors, IvfShuffler};
 use lance_index::vector::{
     ivf::{storage::IvfModel, IvfBuildParams},
     pq::{PQBuildParams, ProductQuantizer},
@@ -300,7 +300,7 @@ async fn do_load_shuffled_vectors(
     pq_model: ProductQuantizer,
 ) -> PyResult<()> {
     let (_, path) = object_store_from_uri_or_path(dir_path).await?;
-    let streams = load_partitioned_shuffles(path.clone(), filenames)
+    let streams = IvfShuffler::load_partitioned_shuffles(&path, filenames)
         .await
         .infer_error()?;
 

diff --git a/rust/lance-file/src/v2/writer.rs b/rust/lance-file/src/v2/writer.rs
@@ -24,6 +24,7 @@ use prost::Message;
 use prost_types::Any;
 use snafu::{location, Location};
 use tokio::io::AsyncWriteExt;
+use tracing::instrument;
 
 use crate::datatypes::FieldsWithMeta;
 use crate::format::pb;
@@ -175,6 +176,7 @@ impl FileWriter {
         Ok(())
     }
 
+    #[instrument(skip_all, level = "debug")]
     async fn write_pages(
         &mut self,
         mut encoding_tasks: FuturesUnordered<EncodeTask>,
@@ -263,6 +265,30 @@ impl FileWriter {
         Ok(self.schema.as_ref().unwrap())
     }
 
+    /// Passes each schema field's column of `batch` to its column writer, collecting the tasks.
+    /// Errors with `InvalidInput` on a missing column; panics if the schema is not yet set.
+    #[instrument(skip_all, level = "debug")]
+    fn encode_batch(&mut self, batch: &RecordBatch) -> Result<Vec<Vec<EncodeTask>>> {
+        let schema = self.schema.as_ref().unwrap();
+        let columns = schema.fields.iter().zip(self.column_writers.iter_mut());
+        columns
+            .map(|(field, column_writer)| {
+                // Lazy error: the message is only formatted when a column is missing.
+                let array = batch
+                    .column_by_name(&field.name)
+                    .ok_or_else(|| Error::InvalidInput {
+                        source: format!(
+                            "Cannot write batch.  The batch was missing the column `{}`",
+                            field.name
+                        )
+                        .into(),
+                        location: location!(),
+                    })?;
+                column_writer.maybe_encode(array.clone())
+            })
+            .collect::<Result<Vec<_>>>()
+    }
+
     /// Schedule a batch of data to be written to the file
     ///
     /// Note: the future returned by this method may complete before the data has been fully
@@ -273,7 +299,6 @@ impl FileWriter {
             batch.get_array_memory_size()
         );
         self.ensure_initialized(batch)?;
-        let schema = self.schema.as_ref().unwrap();
         let num_rows = batch.num_rows() as u64;
         if num_rows == 0 {
             return Ok(());
@@ -292,24 +317,7 @@ impl FileWriter {
         };
         // First we push each array into its column writer.  This may or may not generate enough
         // data to trigger an encoding task.  We collect any encoding tasks into a queue.
-        let encoding_tasks = schema
-            .fields
-            .iter()
-            .zip(self.column_writers.iter_mut())
-            .map(|(field, column_writer)| {
-                let array = batch
-                    .column_by_name(&field.name)
-                    .ok_or(Error::InvalidInput {
-                        source: format!(
-                            "Cannot write batch.  The batch was missing the column `{}`",
-                            field.name
-                        )
-                        .into(),
-                        location: location!(),
-                    })?;
-                column_writer.maybe_encode(array.clone())
-            })
-            .collect::<Result<Vec<_>>>()?;
+        let encoding_tasks = self.encode_batch(batch)?;
         let encoding_tasks = encoding_tasks
             .into_iter()
             .flatten()

diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
@@ -65,6 +65,7 @@ criterion.workspace = true
 lance-datagen.workspace = true
 lance-testing.workspace = true
 tempfile.workspace = true
+test-log.workspace = true
 datafusion-sql.workspace = true
 random_word = { version = "0.4.3", features = ["en"] }