diff --git a/python/python/tests/test_fragment.py b/python/python/tests/test_fragment.py index d6ef9ca090..dcdf02be09 100644 --- a/python/python/tests/test_fragment.py +++ b/python/python/tests/test_fragment.py @@ -270,3 +270,29 @@ def test_fragment_v2(tmp_path): assert len(fragments) == 1 ds = lance.dataset(dataset_uri) assert "minor_version: 3" in format_fragment(fragments[0], ds) + + +def test_mixed_fragment_versions(tmp_path): + data = pa.table({"a": range(800), "b": range(800)}) + + # Create empty v2 dataset + ds = lance.write_dataset( + data_obj=[], + uri=tmp_path / "dataset2", + schema=data.schema, + data_storage_version="stable", + ) + + # Add one v1 file and one v2 file + fragments = [] + fragments.append( + lance.LanceFragment.create(ds.uri, data, data_storage_version="legacy") + ) + fragments.append( + lance.LanceFragment.create(ds.uri, data, data_storage_version="stable") + ) + + # Attempt to commit + operation = lance.LanceOperation.Overwrite(ds.schema, fragments) + with pytest.raises(OSError, match="All data files must have the same version"): + lance.LanceDataset.commit(ds.uri, operation) diff --git a/python/python/tests/test_migration.py b/python/python/tests/test_migration.py new file mode 100644 index 0000000000..3988245f4e --- /dev/null +++ b/python/python/tests/test_migration.py @@ -0,0 +1,39 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + +import shutil +from pathlib import Path + +import lance +import pyarrow as pa + + +def prep_dataset(tmp_path: Path, version, name: str): + dataset_dir = ( + Path(__file__).parent.parent.parent.parent / "test_data" / version / name + ) + shutil.copytree(dataset_dir, tmp_path / version / name) + return lance.dataset(tmp_path / version / name) + + +def test_add_data_storage_version(tmp_path: Path): + """ + In version 0.15 and below we did not have a data storage version. We had + writer flags that were used to determine if we should use the old or new + storage format. In version 0.16 we added a data storage version to the + manifest that should be correctly populated from existing files and/or the + writer flags + """ + tab = pa.table({"x": range(1024)}) + + def check_dataset(dataset_name: str, expected_version: str): + ds = prep_dataset(tmp_path, "v0.15.0", dataset_name) + assert ds.data_storage_version == expected_version + + lance.write_dataset(tab, ds.uri, mode="append") + assert ds.data_storage_version == expected_version + + check_dataset("v1_no_files.lance", "0.1") + check_dataset("v2_no_files.lance", "2.0") + check_dataset("v1_with_files.lance", "0.1") + check_dataset("v2_with_files.lance", "2.0") diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index 98ef30319c..e1668e6d6f 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -313,3 +313,49 @@ def test_dataset_distributed_optimize(tmp_path: Path): assert metrics.fragments_added == 1 # Compaction occurs in two transactions so it increments the version by 2. assert dataset.version == 3 + + +def test_migration_via_fragment_apis(tmp_path): + """ + This test is a regression of a case where we were using fragment APIs to migrate + from v1 to v2 but that left the dataset in a state where it had v2 files but wasn't + marked with the v2 writer flag. + """ + data = pa.table({"a": range(800), "b": range(800)}) + + # Create v1 dataset + ds = lance.write_dataset( + data, tmp_path / "dataset", max_rows_per_file=200, data_storage_version="legacy" + ) + + # Create empty v2 dataset + lance.write_dataset( + data_obj=[], + uri=tmp_path / "dataset2", + schema=ds.schema, + data_storage_version="stable", + ) + + # Add v2 files + fragments = [] + for frag in ds.get_fragments(): + reader = ds.scanner(fragments=[frag]) + fragments.append( + lance.LanceFragment.create( + dataset_uri=tmp_path / "dataset2", + data=reader, + fragment_id=frag.fragment_id, + data_storage_version="stable", + ) + ) + + # Commit + operation = lance.LanceOperation.Overwrite(ds.schema, fragments) + ds2 = lance.LanceDataset.commit(tmp_path / "dataset2", operation) + + # Compact, dataset should still be v2 + ds2.optimize.compact_files() + + ds2 = lance.dataset(tmp_path / "dataset2") + + assert ds2.data_storage_version == "2.0" diff --git a/rust/lance-encoding/src/version.rs b/rust/lance-encoding/src/version.rs index 312c7ba41b..ad905c393a 100644 --- a/rust/lance-encoding/src/version.rs +++ b/rust/lance-encoding/src/version.rs @@ -41,6 +41,20 @@ impl LanceFileVersion { // This will go away soon, but there are a few spots where the Legacy default doesn't make sense Self::V2_0 } + + pub fn try_from_major_minor(major: u32, minor: u32) -> Result { + match (major, minor) { + (0, 0) => Ok(Self::Legacy), + (0, 1) => Ok(Self::Legacy), + (0, 3) => Ok(Self::V2_0), + (2, 0) => Ok(Self::V2_0), + (2, 1) => Ok(Self::V2_1), + _ => Err(Error::InvalidInput { + source: format!("Unknown Lance storage version: {}.{}", major, minor).into(), + location: location!(), + }), + } + } } impl std::fmt::Display for LanceFileVersion { diff --git a/rust/lance-table/src/format/fragment.rs b/rust/lance-table/src/format/fragment.rs index f182da7268..c4b2f97a2f 100644 --- a/rust/lance-table/src/format/fragment.rs +++ b/rust/lance-table/src/format/fragment.rs @@ -3,6 +3,7 @@ use lance_core::Error; use lance_file::format::{MAJOR_VERSION, MINOR_VERSION_NEXT}; +use lance_file::version::LanceFileVersion; use object_store::path::Path; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; @@ -301,6 +302,33 @@ impl Fragment { // If any file in a fragment is legacy then all files in the fragment must be self.files[0].is_legacy_file() } + + // Helper method to infer the Lance version from a set of fragments + pub fn try_infer_version(fragments: &[Self]) -> Result { + // Otherwise we need to check the actual file versions + // Determine version from first file + let sample_file = &fragments[0].files[0]; + let file_version = LanceFileVersion::try_from_major_minor( + sample_file.file_major_version, + sample_file.file_minor_version, + )?; + // Ensure all files match + for frag in fragments { + for file in &frag.files { + let this_file_version = LanceFileVersion::try_from_major_minor( + file.file_major_version, + file.file_minor_version, + )?; + if file_version != this_file_version { + return Err(Error::invalid_input( + "All data files must have the same version", + location!(), + )); + } + } + } + Ok(file_version) + } } impl TryFrom for Fragment { diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 174463ab46..0d7ee43dcd 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -436,10 +436,16 @@ impl TryFrom for Manifest { let data_storage_format = match p.data_format { None => { - if has_deprecated_v2_feature_flag(p.writer_feature_flags) { - DataStorageFormat::new(LanceFileVersion::V2_0) + if fragments.is_empty() { + // No fragments to inspect, best we can do is look at writer flags + if has_deprecated_v2_feature_flag(p.writer_feature_flags) { + DataStorageFormat::new(LanceFileVersion::Stable) + } else { + DataStorageFormat::new(LanceFileVersion::Legacy) + } } else { - DataStorageFormat::new(LanceFileVersion::Legacy) + // If there are fragments, they are a better indicator + DataStorageFormat::new(Fragment::try_infer_version(fragments.as_ref())?) } } Some(format) => DataStorageFormat::from(format), diff --git a/rust/lance/src/dataset/transaction.rs b/rust/lance/src/dataset/transaction.rs index 0946a04549..f2cb1ec324 100644 --- a/rust/lance/src/dataset/transaction.rs +++ b/rust/lance/src/dataset/transaction.rs @@ -329,6 +329,31 @@ impl Transaction { }) } + fn data_storage_format_from_files( + fragments: &[Fragment], + user_requested: Option, + ) -> Result { + // If no files use user-requested or default + if fragments.is_empty() { + return Ok(user_requested + .map(DataStorageFormat::new) + .unwrap_or_default()); + } + // Otherwise we need to check the actual file versions + let file_version = Fragment::try_infer_version(fragments)?; + + // Ensure user-requested matches data files + if let Some(user_requested) = user_requested { + if user_requested != file_version { + return Err(Error::invalid_input( + format!("User requested data storage version ({}) does not match version in data files ({})", user_requested, file_version), + location!(), + )); + } + } + Ok(DataStorageFormat::new(file_version)) + } + pub(crate) async fn restore_old_manifest( object_store: &ObjectStore, commit_handler: &dyn CommitHandler, @@ -568,21 +593,26 @@ impl Transaction { // If a fragment was reserved then it may not belong at the end of the fragments list. final_fragments.sort_by_key(|frag| frag.id); - let data_storage_format = match (&config.storage_format, config.use_legacy_format) { - (Some(storage_format), _) => storage_format.clone(), - (None, Some(true)) => DataStorageFormat::new(LanceFileVersion::Legacy), - (None, Some(false)) => DataStorageFormat::new(LanceFileVersion::V2_0), - (None, None) => DataStorageFormat::default(), + let user_requested_version = match (&config.storage_format, config.use_legacy_format) { + (Some(storage_format), _) => Some(storage_format.lance_file_version()?), + (None, Some(true)) => Some(LanceFileVersion::Legacy), + (None, Some(false)) => Some(LanceFileVersion::V2_0), + (None, None) => None, }; let mut manifest = if let Some(current_manifest) = current_manifest { let mut prev_manifest = Manifest::new_from_previous(current_manifest, schema, Arc::new(final_fragments)); if matches!(self.operation, Operation::Overwrite { .. }) { - prev_manifest.data_storage_format = data_storage_format; + prev_manifest.data_storage_format = Self::data_storage_format_from_files( + &prev_manifest.fragments, + user_requested_version, + )?; } prev_manifest } else { + let data_storage_format = + Self::data_storage_format_from_files(&final_fragments, user_requested_version)?; Manifest::new(schema, Arc::new(final_fragments), data_storage_format) }; diff --git a/test_data/v0.15.0/README.md b/test_data/v0.15.0/README.md new file mode 100644 index 0000000000..e7ba34e462 --- /dev/null +++ b/test_data/v0.15.0/README.md @@ -0,0 +1,4 @@ +# Historical Datasets + +This folder contains datasets from older versions of Lance that we use for backwards +compatibility testing. diff --git a/test_data/v0.15.0/v1_no_files.lance/_latest.manifest b/test_data/v0.15.0/v1_no_files.lance/_latest.manifest new file mode 100644 index 0000000000..058715267f Binary files /dev/null and b/test_data/v0.15.0/v1_no_files.lance/_latest.manifest differ diff --git a/test_data/v0.15.0/v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn b/test_data/v0.15.0/v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn new file mode 100644 index 0000000000..2a21a88ffe --- /dev/null +++ b/test_data/v0.15.0/v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn @@ -0,0 +1 @@ +$4a3f63b9-e75a-42ed-8dad-b737ebd29e5f²x ÿÿÿÿÿÿÿÿÿ*int6408 \ No newline at end of file diff --git a/test_data/v0.15.0/v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn b/test_data/v0.15.0/v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn new file mode 100644 index 0000000000..8dfe0df9f6 --- /dev/null +++ b/test_data/v0.15.0/v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn @@ -0,0 +1 @@ +$b2bbb126-958f-48ef-81f7-b33deac71663²x ÿÿÿÿÿÿÿÿÿ*int6408 \ No newline at end of file diff --git a/test_data/v0.15.0/v1_no_files.lance/_versions/1.manifest b/test_data/v0.15.0/v1_no_files.lance/_versions/1.manifest new file mode 100644 index 0000000000..058715267f Binary files /dev/null and b/test_data/v0.15.0/v1_no_files.lance/_versions/1.manifest differ diff --git a/test_data/v0.15.0/v1_with_files.lance/_latest.manifest b/test_data/v0.15.0/v1_with_files.lance/_latest.manifest new file mode 100644 index 0000000000..925075c03d Binary files /dev/null and b/test_data/v0.15.0/v1_with_files.lance/_latest.manifest differ diff --git a/test_data/v0.15.0/v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn b/test_data/v0.15.0/v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn new file mode 100644 index 0000000000..b156a4cfeb Binary files /dev/null and b/test_data/v0.15.0/v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn differ diff --git a/test_data/v0.15.0/v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn b/test_data/v0.15.0/v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn new file mode 100644 index 0000000000..6bee1e63f4 Binary files /dev/null and b/test_data/v0.15.0/v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn differ diff --git a/test_data/v0.15.0/v1_with_files.lance/_versions/1.manifest b/test_data/v0.15.0/v1_with_files.lance/_versions/1.manifest new file mode 100644 index 0000000000..925075c03d Binary files /dev/null and b/test_data/v0.15.0/v1_with_files.lance/_versions/1.manifest differ diff --git a/test_data/v0.15.0/v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance b/test_data/v0.15.0/v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance new file mode 100644 index 0000000000..c7e354cdbe Binary files /dev/null and b/test_data/v0.15.0/v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance differ diff --git a/test_data/v0.15.0/v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance b/test_data/v0.15.0/v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance new file mode 100644 index 0000000000..c7e354cdbe Binary files /dev/null and b/test_data/v0.15.0/v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance differ diff --git a/test_data/v0.15.0/v2_no_files.lance/_latest.manifest b/test_data/v0.15.0/v2_no_files.lance/_latest.manifest new file mode 100644 index 0000000000..cf387121da Binary files /dev/null and b/test_data/v0.15.0/v2_no_files.lance/_latest.manifest differ diff --git a/test_data/v0.15.0/v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn b/test_data/v0.15.0/v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn new file mode 100644 index 0000000000..03ef7a7b5f --- /dev/null +++ b/test_data/v0.15.0/v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn @@ -0,0 +1 @@ +$131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4²x ÿÿÿÿÿÿÿÿÿ*int6408 \ No newline at end of file diff --git a/test_data/v0.15.0/v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn b/test_data/v0.15.0/v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn new file mode 100644 index 0000000000..753d3da8b8 --- /dev/null +++ b/test_data/v0.15.0/v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn @@ -0,0 +1 @@ +$ffe96c42-4ab0-4b8a-845b-31faa8b24e4d²x ÿÿÿÿÿÿÿÿÿ*int6408 \ No newline at end of file diff --git a/test_data/v0.15.0/v2_no_files.lance/_versions/1.manifest b/test_data/v0.15.0/v2_no_files.lance/_versions/1.manifest new file mode 100644 index 0000000000..cf387121da Binary files /dev/null and b/test_data/v0.15.0/v2_no_files.lance/_versions/1.manifest differ diff --git a/test_data/v0.15.0/v2_with_files.lance/_latest.manifest b/test_data/v0.15.0/v2_with_files.lance/_latest.manifest new file mode 100644 index 0000000000..debde5af1b Binary files /dev/null and b/test_data/v0.15.0/v2_with_files.lance/_latest.manifest differ diff --git a/test_data/v0.15.0/v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn b/test_data/v0.15.0/v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn new file mode 100644 index 0000000000..1842974c45 Binary files /dev/null and b/test_data/v0.15.0/v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn differ diff --git a/test_data/v0.15.0/v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn b/test_data/v0.15.0/v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn new file mode 100644 index 0000000000..83e3847c64 Binary files /dev/null and b/test_data/v0.15.0/v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn differ diff --git a/test_data/v0.15.0/v2_with_files.lance/_versions/1.manifest b/test_data/v0.15.0/v2_with_files.lance/_versions/1.manifest new file mode 100644 index 0000000000..debde5af1b Binary files /dev/null and b/test_data/v0.15.0/v2_with_files.lance/_versions/1.manifest differ diff --git a/test_data/v0.15.0/v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance b/test_data/v0.15.0/v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance new file mode 100644 index 0000000000..8297828dd5 Binary files /dev/null and b/test_data/v0.15.0/v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance differ diff --git a/test_data/v0.15.0/v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance b/test_data/v0.15.0/v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance new file mode 100644 index 0000000000..c7e354cdbe Binary files /dev/null and b/test_data/v0.15.0/v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance differ