From 98dc3a00c36b39ae512dc43971bfb43a7bc12648 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 19 Aug 2024 10:57:47 -0700 Subject: [PATCH 1/5] fix several situations where we were incorrectly inferring the dataset version --- python/python/tests/test_fragment.py | 26 ++++++++++++++ python/python/tests/test_optimize.py | 46 +++++++++++++++++++++++++ rust/lance-encoding/src/version.rs | 14 ++++++++ rust/lance-table/src/format/fragment.rs | 28 +++++++++++++++ rust/lance-table/src/format/manifest.rs | 12 +++++-- rust/lance/src/dataset/transaction.rs | 42 ++++++++++++++++++---- 6 files changed, 159 insertions(+), 9 deletions(-) diff --git a/python/python/tests/test_fragment.py b/python/python/tests/test_fragment.py index d6ef9ca090..dcdf02be09 100644 --- a/python/python/tests/test_fragment.py +++ b/python/python/tests/test_fragment.py @@ -270,3 +270,29 @@ def test_fragment_v2(tmp_path): assert len(fragments) == 1 ds = lance.dataset(dataset_uri) assert "minor_version: 3" in format_fragment(fragments[0], ds) + + +def test_mixed_fragment_versions(tmp_path): + data = pa.table({"a": range(800), "b": range(800)}) + + # Create empty v2 dataset + ds = lance.write_dataset( + data_obj=[], + uri=tmp_path / "dataset2", + schema=data.schema, + data_storage_version="stable", + ) + + # Add one v1 file and one v2 file + fragments = [] + fragments.append( + lance.LanceFragment.create(ds.uri, data, data_storage_version="legacy") + ) + fragments.append( + lance.LanceFragment.create(ds.uri, data, data_storage_version="stable") + ) + + # Attempt to commit + operation = lance.LanceOperation.Overwrite(ds.schema, fragments) + with pytest.raises(OSError, match="All data files must have the same version"): + lance.LanceDataset.commit(ds.uri, operation) diff --git a/python/python/tests/test_optimize.py b/python/python/tests/test_optimize.py index 98ef30319c..e1668e6d6f 100644 --- a/python/python/tests/test_optimize.py +++ b/python/python/tests/test_optimize.py @@ -313,3 +313,49 @@ def test_dataset_distributed_optimize(tmp_path: Path): assert metrics.fragments_added == 1 # Compaction occurs in two transactions so it increments the version by 2. assert dataset.version == 3 + + +def test_migration_via_fragment_apis(tmp_path): + """ + This test is a regression of a case where we were using fragment APIs to migrate + from v1 to v2 but that left the dataset in a state where it had v2 files but wasn't + marked with the v2 writer flag. + """ + data = pa.table({"a": range(800), "b": range(800)}) + + # Create v1 dataset + ds = lance.write_dataset( + data, tmp_path / "dataset", max_rows_per_file=200, data_storage_version="legacy" + ) + + # Create empty v2 dataset + lance.write_dataset( + data_obj=[], + uri=tmp_path / "dataset2", + schema=ds.schema, + data_storage_version="stable", + ) + + # Add v2 files + fragments = [] + for frag in ds.get_fragments(): + reader = ds.scanner(fragments=[frag]) + fragments.append( + lance.LanceFragment.create( + dataset_uri=tmp_path / "dataset2", + data=reader, + fragment_id=frag.fragment_id, + data_storage_version="stable", + ) + ) + + # Commit + operation = lance.LanceOperation.Overwrite(ds.schema, fragments) + ds2 = lance.LanceDataset.commit(tmp_path / "dataset2", operation) + + # Compact, dataset should still be v2 + ds2.optimize.compact_files() + + ds2 = lance.dataset(tmp_path / "dataset2") + + assert ds2.data_storage_version == "2.0" diff --git a/rust/lance-encoding/src/version.rs b/rust/lance-encoding/src/version.rs index 312c7ba41b..ad905c393a 100644 --- a/rust/lance-encoding/src/version.rs +++ b/rust/lance-encoding/src/version.rs @@ -41,6 +41,20 @@ impl LanceFileVersion { // This will go away soon, but there are a few spots where the Legacy default doesn't make sense Self::V2_0 } + + pub fn try_from_major_minor(major: u32, minor: u32) -> Result { + match (major, minor) { + (0, 0) => Ok(Self::Legacy), + (0, 1) => Ok(Self::Legacy), + (0, 3) => Ok(Self::V2_0), + (2, 0) => Ok(Self::V2_0), + (2, 1) => Ok(Self::V2_1), + _ => Err(Error::InvalidInput { + source: format!("Unknown Lance storage version: {}.{}", major, minor).into(), + location: location!(), + }), + } + } } impl std::fmt::Display for LanceFileVersion { diff --git a/rust/lance-table/src/format/fragment.rs b/rust/lance-table/src/format/fragment.rs index f182da7268..c4b2f97a2f 100644 --- a/rust/lance-table/src/format/fragment.rs +++ b/rust/lance-table/src/format/fragment.rs @@ -3,6 +3,7 @@ use lance_core::Error; use lance_file::format::{MAJOR_VERSION, MINOR_VERSION_NEXT}; +use lance_file::version::LanceFileVersion; use object_store::path::Path; use serde::{Deserialize, Serialize}; use snafu::{location, Location}; @@ -301,6 +302,33 @@ impl Fragment { // If any file in a fragment is legacy then all files in the fragment must be self.files[0].is_legacy_file() } + + // Helper method to infer the Lance version from a set of fragments + pub fn try_infer_version(fragments: &[Self]) -> Result { + // Otherwise we need to check the actual file versions + // Determine version from first file + let sample_file = &fragments[0].files[0]; + let file_version = LanceFileVersion::try_from_major_minor( + sample_file.file_major_version, + sample_file.file_minor_version, + )?; + // Ensure all files match + for frag in fragments { + for file in &frag.files { + let this_file_version = LanceFileVersion::try_from_major_minor( + file.file_major_version, + file.file_minor_version, + )?; + if file_version != this_file_version { + return Err(Error::invalid_input( + "All data files must have the same version", + location!(), + )); + } + } + } + Ok(file_version) + } } impl TryFrom for Fragment { diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 174463ab46..09c8502ac2 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -436,10 +436,16 @@ impl TryFrom for Manifest { let data_storage_format = match p.data_format { None => { - if has_deprecated_v2_feature_flag(p.writer_feature_flags) { - DataStorageFormat::new(LanceFileVersion::V2_0) + if fragments.is_empty() { + // No fragments to inspect, best we can do is look at writer flags + if has_deprecated_v2_feature_flag(p.writer_feature_flags) { + DataStorageFormat::new(LanceFileVersion::Legacy) + } else { + DataStorageFormat::new(LanceFileVersion::Stable) + } } else { - DataStorageFormat::new(LanceFileVersion::Legacy) + // If there are fragments, they are a better indicator + DataStorageFormat::new(Fragment::try_infer_version(fragments.as_ref())?) } } Some(format) => DataStorageFormat::from(format), diff --git a/rust/lance/src/dataset/transaction.rs b/rust/lance/src/dataset/transaction.rs index 0946a04549..f2cb1ec324 100644 --- a/rust/lance/src/dataset/transaction.rs +++ b/rust/lance/src/dataset/transaction.rs @@ -329,6 +329,31 @@ impl Transaction { }) } + fn data_storage_format_from_files( + fragments: &[Fragment], + user_requested: Option, + ) -> Result { + // If no files use user-requested or default + if fragments.is_empty() { + return Ok(user_requested + .map(DataStorageFormat::new) + .unwrap_or_default()); + } + // Otherwise we need to check the actual file versions + let file_version = Fragment::try_infer_version(fragments)?; + + // Ensure user-requested matches data files + if let Some(user_requested) = user_requested { + if user_requested != file_version { + return Err(Error::invalid_input( + format!("User requested data storage version ({}) does not match version in data files ({})", user_requested, file_version), + location!(), + )); + } + } + Ok(DataStorageFormat::new(file_version)) + } + pub(crate) async fn restore_old_manifest( object_store: &ObjectStore, commit_handler: &dyn CommitHandler, @@ -568,21 +593,26 @@ impl Transaction { // If a fragment was reserved then it may not belong at the end of the fragments list. final_fragments.sort_by_key(|frag| frag.id); - let data_storage_format = match (&config.storage_format, config.use_legacy_format) { - (Some(storage_format), _) => storage_format.clone(), - (None, Some(true)) => DataStorageFormat::new(LanceFileVersion::Legacy), - (None, Some(false)) => DataStorageFormat::new(LanceFileVersion::V2_0), - (None, None) => DataStorageFormat::default(), + let user_requested_version = match (&config.storage_format, config.use_legacy_format) { + (Some(storage_format), _) => Some(storage_format.lance_file_version()?), + (None, Some(true)) => Some(LanceFileVersion::Legacy), + (None, Some(false)) => Some(LanceFileVersion::V2_0), + (None, None) => None, }; let mut manifest = if let Some(current_manifest) = current_manifest { let mut prev_manifest = Manifest::new_from_previous(current_manifest, schema, Arc::new(final_fragments)); if matches!(self.operation, Operation::Overwrite { .. }) { - prev_manifest.data_storage_format = data_storage_format; + prev_manifest.data_storage_format = Self::data_storage_format_from_files( + &prev_manifest.fragments, + user_requested_version, + )?; } prev_manifest } else { + let data_storage_format = + Self::data_storage_format_from_files(&final_fragments, user_requested_version)?; Manifest::new(schema, Arc::new(final_fragments), data_storage_format) }; From 2d0e6e94270fed076cfc76bb118aa52977a7d19d Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 19 Aug 2024 11:35:34 -0700 Subject: [PATCH 2/5] Add unit tests covering migration from no data format version to yes data format version --- .../0_15_0_v1_no_files.lance/_latest.manifest | Bin 0 -> 123 bytes ...0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn | 1 + ...0-b2bbb126-958f-48ef-81f7-b33deac71663.txn | 1 + .../_versions/1.manifest | Bin 0 -> 123 bytes .../_latest.manifest | Bin 0 -> 176 bytes ...0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn | Bin 0 -> 121 bytes ...0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn | Bin 0 -> 121 bytes .../_versions/1.manifest | Bin 0 -> 176 bytes ...320f7c03-94b1-4107-941f-eaab4a75792d.lance | Bin 0 -> 318 bytes ...7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance | Bin 0 -> 318 bytes .../0_15_0_v2_no_files.lance/_latest.manifest | Bin 0 -> 125 bytes ...0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn | 1 + ...0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn | 1 + .../_versions/1.manifest | Bin 0 -> 125 bytes .../_latest.manifest | Bin 0 -> 183 bytes ...0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn | Bin 0 -> 126 bytes ...0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn | Bin 0 -> 121 bytes .../_versions/1.manifest | Bin 0 -> 183 bytes ...38639abd-798b-4005-87a1-39d7a151ee30.lance | Bin 0 -> 232 bytes ...f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance | Bin 0 -> 318 bytes .../tests/historical_datasets/README.md | 4 +++ python/python/tests/test_migration.py | 34 ++++++++++++++++++ rust/lance-table/src/format/manifest.rs | 4 +-- 23 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_latest.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_versions/1.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_latest.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_versions/1.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance create mode 100644 python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_latest.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_versions/1.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_latest.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_versions/1.manifest create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance create mode 100644 python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance create mode 100644 python/python/tests/historical_datasets/README.md create mode 100644 python/python/tests/test_migration.py diff --git a/python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_latest.manifest b/python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_latest.manifest new file mode 100644 index 0000000000000000000000000000000000000000..058715267fa0d7b4ea8eb6a41705d87cea4b170d GIT binary patch literal 123 zcmYdiU|`^q6k@DU_zwe&TCACQC1xfDj24U%j8@znU*`91V-xsxvMnP?%Ro2DC@Cq) z(8x^J($peN*Tf<g8Cic z=RXrdvIV0<{`#Xu(sq4?f*Q(7w#*70v`{)A7-=g&=_FW02da@PqnxxR8h*506Bs4= zXinFY@$-7Rr?cDR(z-%`k63GqB?M&yp!X6CdJAQAbuD8w%2?hH-C>#~-%u*JR9t-a Kekj>(_S-j;V=y5A literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn b/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn new file mode 100644 index 0000000000000000000000000000000000000000..b156a4cfebb13eccf435fa28f9cc7b60a34513d1 GIT binary patch literal 121 zcmXxWF%H5o3;;kP1Vcw}Y#o>?8JyUOlV9+KI87uZ)QN$2@j+nWri1gEE5%}rEYVO@ z4XurYgL~&xLeJ{o$J@kkofZx>v=L(w^C+J5H$Sk~YEm_73G3xix7NY!<~^Liz2?uK MTMqZ_nS@++Uvf?(0{{R3 literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn b/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn new file mode 100644 index 0000000000000000000000000000000000000000..6bee1e63f4f34034689369df2b466cae24554b1d GIT binary patch literal 121 zcmXxRF%H5o3;;kZ1Vcw}Y#o>?8EWD*&M)|a)7TOc>cqgi_#jN&;kk#@Vv@+SIZ5wK zD%~hWtCb@(t9_35A>nnHikCST7gDx`gd3BIG|DUAG&eR4y*zDO9bq@_@r3Xmzke_t K9@}e?85Dl8kt3Y| literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_versions/1.manifest b/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_versions/1.manifest new file mode 100644 index 0000000000000000000000000000000000000000..925075c03dce3bbfd9a8b36db792e601627209cb GIT binary patch literal 176 zcmXxeF$%&k6oBCbX^}c8nRM%1crmdrP3k7D;vJGEfr@nM&{KE<(F-_x39sX>g8Cic z=RXrdvIV0<{`#Xu(sq4?f*Q(7w#*70v`{)A7-=g&=_FW02da@PqnxxR8h*506Bs4= zXinFY@$-7Rr?cDR(z-%`k63GqB?M&yp!X6CdJAQAbuD8w%2?hH-C>#~-%u*JR9t-a Kekj>(_S-j;V=y5A literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance b/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance new file mode 100644 index 0000000000000000000000000000000000000000..c7e354cdbe606c623436d4eaa9fb51d1e6c34848 GIT binary patch literal 318 zcmZQ%fB+^a%?zbs6gr;~hdP*g4JZSq&H#;XfyUPZ^0_317%LS1!vLceYi3@FnTY|T z1)~IG7C#qjPGVkist}ujo}sCpK?P6;M+2h}69cn^gjOz>m=L1@vSzm8lA_Y&5-xEe zuDsHmocQGY(!3H0n4uPoToOW@xtV$KWr;bZsS->`mVspwE1)vWNHRh!42(?75)PZ7 Ou4iCiV(@YFa|QsCYc#n4 literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance b/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance new file mode 100644 index 0000000000000000000000000000000000000000..c7e354cdbe606c623436d4eaa9fb51d1e6c34848 GIT binary patch literal 318 zcmZQ%fB+^a%?zbs6gr;~hdP*g4JZSq&H#;XfyUPZ^0_317%LS1!vLceYi3@FnTY|T z1)~IG7C#qjPGVkist}ujo}sCpK?P6;M+2h}69cn^gjOz>m=L1@vSzm8lA_Y&5-xEe zuDsHmocQGY(!3H0n4uPoToOW@xtV$KWr;bZsS->`mVspwE1)vWNHRh!42(?75)PZ7 Ou4iCiV(@YFa|QsCYc#n4 literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_latest.manifest b/python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_latest.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cf387121da71970b3e4b2e4988249a0155bc320c GIT binary patch literal 125 zcmc~~U|`^q6k@DU_zwe&TCACQC1xfDj24U%j8@znU*`91V-q;G;9*7pOOlp>uA#AE zT54LNnXb7>a-yzDl2MwjrBRZxu4!7TWuj?fqIqhPiC#%XUKT$WYffTba;gxUfu5nM Qo&nGuA#AE zT54LNnXb7>a-yzDl2MwjrBRZxu4!7TWuj?fqIqhPiC#%XUKT$WYffTba;gxUfu5nM Qo&nGtJe_hQDU`f(4 z&^0hMPP0r-Ox86uvNY8-F)}jJO-wYi)HO6QF;2EfwM-%^g#PKkZWKvry&MFJC_X%aTfTZ?84@RP?O*VP)JY{I)QN$2@j+nW=EJkcr7p=MNiT_V z?~RO|DRjvuW-@DeA8$j#>oBc_*H+vpxXLdxld?K_v&gY};5c6%b@Ps}8~1oZc#WSw Mm=5>tDLH_`7lL0S2mk;8 literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_versions/1.manifest b/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_versions/1.manifest new file mode 100644 index 0000000000000000000000000000000000000000..debde5af1b40b8934345ba472fcd02c086b0e693 GIT binary patch literal 183 zcmZ3?z`(#IDa2Tz@E-;kwOBLrO3X|Q7%douEQCzBw2Upxj4czBQgqENEs}Ii3=B+l zEzA=Qb&V}kfRw3WYO1k;UQS|Oa;gv`gA^l!2D1XQ1fvx<$Cvqi+t>tJe_hQDU`f(4 z&^0hMPP0r-Ox86uvNY8-F)}jJO-wYi)HO6QF;2EfwMOh?lrO}@#UNzE#mFGUC?UbDWF%z3rNm=L1@vSzm8lA_Y&5-xEe zuDsHmocQGY(!3H0n4uPoToOW@xtV$KWr;bZsS->`mVspwE1)vWNHRh!42(?75)PZ7 Ou4iCiV(@YFa|QsCYc#n4 literal 0 HcmV?d00001 diff --git a/python/python/tests/historical_datasets/README.md b/python/python/tests/historical_datasets/README.md new file mode 100644 index 0000000000..9903a10f1d --- /dev/null +++ b/python/python/tests/historical_datasets/README.md @@ -0,0 +1,4 @@ +# Historical Datasets + +This folder cotnains datasets from older versions of Lance that we use for backwards +compatibility testing. diff --git a/python/python/tests/test_migration.py b/python/python/tests/test_migration.py new file mode 100644 index 0000000000..8aad8decf2 --- /dev/null +++ b/python/python/tests/test_migration.py @@ -0,0 +1,34 @@ +import shutil +from pathlib import Path + +import lance +import pyarrow as pa + + +def prep_dataset(tmp_path: Path, name: str): + dataset_dir = Path(__file__).parent / "historical_datasets" / name + shutil.copytree(dataset_dir, tmp_path / name) + return lance.dataset(tmp_path / name) + + +def test_add_data_storage_version(tmp_path: Path): + """ + In version 0.15 and below we did not have a data storage version. We had + writer flags that were used to determine if we should use the old or new + storage format. In version 0.16 we added a data storage version to the + manifest that should be correctly populated from existing files and/or the + writer flags + """ + tab = pa.table({"x": range(1024)}) + + def check_dataset(dataset_name: str, expected_version: str): + ds = prep_dataset(tmp_path, dataset_name) + assert ds.data_storage_version == expected_version + + lance.write_dataset(tab, ds.uri, mode="append") + assert ds.data_storage_version == expected_version + + check_dataset("0_15_0_v1_no_files.lance", "0.1") + check_dataset("0_15_0_v2_no_files.lance", "2.0") + check_dataset("0_15_0_v1_with_files.lance", "0.1") + check_dataset("0_15_0_v2_with_files.lance", "2.0") diff --git a/rust/lance-table/src/format/manifest.rs b/rust/lance-table/src/format/manifest.rs index 09c8502ac2..0d7ee43dcd 100644 --- a/rust/lance-table/src/format/manifest.rs +++ b/rust/lance-table/src/format/manifest.rs @@ -439,9 +439,9 @@ impl TryFrom for Manifest { if fragments.is_empty() { // No fragments to inspect, best we can do is look at writer flags if has_deprecated_v2_feature_flag(p.writer_feature_flags) { - DataStorageFormat::new(LanceFileVersion::Legacy) - } else { DataStorageFormat::new(LanceFileVersion::Stable) + } else { + DataStorageFormat::new(LanceFileVersion::Legacy) } } else { // If there are fragments, they are a better indicator From b4229e814a612bcd4c2398ddf568d1cc9f32d3c0 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 19 Aug 2024 11:36:33 -0700 Subject: [PATCH 3/5] Add license header --- python/python/tests/test_migration.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/python/tests/test_migration.py b/python/python/tests/test_migration.py index 8aad8decf2..ced45a4dcd 100644 --- a/python/python/tests/test_migration.py +++ b/python/python/tests/test_migration.py @@ -1,3 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright The Lance Authors + import shutil from pathlib import Path From 034cafc0ea68a8f2db73d22e1efd465949bf2abe Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 19 Aug 2024 12:07:23 -0700 Subject: [PATCH 4/5] Update python/python/tests/historical_datasets/README.md Co-authored-by: Will Jones --- python/python/tests/historical_datasets/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/python/tests/historical_datasets/README.md b/python/python/tests/historical_datasets/README.md index 9903a10f1d..e7ba34e462 100644 --- a/python/python/tests/historical_datasets/README.md +++ b/python/python/tests/historical_datasets/README.md @@ -1,4 +1,4 @@ # Historical Datasets -This folder cotnains datasets from older versions of Lance that we use for backwards +This folder contains datasets from older versions of Lance that we use for backwards compatibility testing. From 849d48a9bf390fb201a6eb25b840b05e443692b8 Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 19 Aug 2024 12:12:22 -0700 Subject: [PATCH 5/5] Move historical datasets into test_data folder --- python/python/tests/test_migration.py | 20 ++++++++++-------- .../v0.15.0}/README.md | 0 .../v1_no_files.lance}/_latest.manifest | Bin ...0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn | 0 ...0-b2bbb126-958f-48ef-81f7-b33deac71663.txn | 0 .../v1_no_files.lance}/_versions/1.manifest | Bin .../v1_with_files.lance}/_latest.manifest | Bin ...0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn | Bin ...0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn | Bin .../v1_with_files.lance}/_versions/1.manifest | Bin ...320f7c03-94b1-4107-941f-eaab4a75792d.lance | Bin ...7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance | Bin .../v2_no_files.lance}/_latest.manifest | Bin ...0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn | 0 ...0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn | 0 .../v2_no_files.lance}/_versions/1.manifest | Bin .../v2_with_files.lance}/_latest.manifest | Bin ...0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn | Bin ...0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn | Bin .../v2_with_files.lance}/_versions/1.manifest | Bin ...38639abd-798b-4005-87a1-39d7a151ee30.lance | Bin ...f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance | Bin 22 files changed, 11 insertions(+), 9 deletions(-) rename {python/python/tests/historical_datasets => test_data/v0.15.0}/README.md (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_no_files.lance => test_data/v0.15.0/v1_no_files.lance}/_latest.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_no_files.lance => test_data/v0.15.0/v1_no_files.lance}/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_no_files.lance => test_data/v0.15.0/v1_no_files.lance}/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_no_files.lance => test_data/v0.15.0/v1_no_files.lance}/_versions/1.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_with_files.lance => test_data/v0.15.0/v1_with_files.lance}/_latest.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_with_files.lance => test_data/v0.15.0/v1_with_files.lance}/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_with_files.lance => test_data/v0.15.0/v1_with_files.lance}/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_with_files.lance => test_data/v0.15.0/v1_with_files.lance}/_versions/1.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_with_files.lance => test_data/v0.15.0/v1_with_files.lance}/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance (100%) rename {python/python/tests/historical_datasets/0_15_0_v1_with_files.lance => test_data/v0.15.0/v1_with_files.lance}/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_no_files.lance => test_data/v0.15.0/v2_no_files.lance}/_latest.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_no_files.lance => test_data/v0.15.0/v2_no_files.lance}/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_no_files.lance => test_data/v0.15.0/v2_no_files.lance}/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_no_files.lance => test_data/v0.15.0/v2_no_files.lance}/_versions/1.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_with_files.lance => test_data/v0.15.0/v2_with_files.lance}/_latest.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_with_files.lance => test_data/v0.15.0/v2_with_files.lance}/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_with_files.lance => test_data/v0.15.0/v2_with_files.lance}/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_with_files.lance => test_data/v0.15.0/v2_with_files.lance}/_versions/1.manifest (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_with_files.lance => test_data/v0.15.0/v2_with_files.lance}/data/38639abd-798b-4005-87a1-39d7a151ee30.lance (100%) rename {python/python/tests/historical_datasets/0_15_0_v2_with_files.lance => test_data/v0.15.0/v2_with_files.lance}/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance (100%) diff --git a/python/python/tests/test_migration.py b/python/python/tests/test_migration.py index ced45a4dcd..3988245f4e 100644 --- a/python/python/tests/test_migration.py +++ b/python/python/tests/test_migration.py @@ -8,10 +8,12 @@ import pyarrow as pa -def prep_dataset(tmp_path: Path, name: str): - dataset_dir = Path(__file__).parent / "historical_datasets" / name - shutil.copytree(dataset_dir, tmp_path / name) - return lance.dataset(tmp_path / name) +def prep_dataset(tmp_path: Path, version, name: str): + dataset_dir = ( + Path(__file__).parent.parent.parent.parent / "test_data" / version / name + ) + shutil.copytree(dataset_dir, tmp_path / version / name) + return lance.dataset(tmp_path / version / name) def test_add_data_storage_version(tmp_path: Path): @@ -25,13 +27,13 @@ def test_add_data_storage_version(tmp_path: Path): tab = pa.table({"x": range(1024)}) def check_dataset(dataset_name: str, expected_version: str): - ds = prep_dataset(tmp_path, dataset_name) + ds = prep_dataset(tmp_path, "v0.15.0", dataset_name) assert ds.data_storage_version == expected_version lance.write_dataset(tab, ds.uri, mode="append") assert ds.data_storage_version == expected_version - check_dataset("0_15_0_v1_no_files.lance", "0.1") - check_dataset("0_15_0_v2_no_files.lance", "2.0") - check_dataset("0_15_0_v1_with_files.lance", "0.1") - check_dataset("0_15_0_v2_with_files.lance", "2.0") + check_dataset("v1_no_files.lance", "0.1") + check_dataset("v2_no_files.lance", "2.0") + check_dataset("v1_with_files.lance", "0.1") + check_dataset("v2_with_files.lance", "2.0") diff --git a/python/python/tests/historical_datasets/README.md b/test_data/v0.15.0/README.md similarity index 100% rename from python/python/tests/historical_datasets/README.md rename to test_data/v0.15.0/README.md diff --git a/python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_latest.manifest b/test_data/v0.15.0/v1_no_files.lance/_latest.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_latest.manifest rename to test_data/v0.15.0/v1_no_files.lance/_latest.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn b/test_data/v0.15.0/v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn rename to test_data/v0.15.0/v1_no_files.lance/_transactions/0-4a3f63b9-e75a-42ed-8dad-b737ebd29e5f.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn b/test_data/v0.15.0/v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn rename to test_data/v0.15.0/v1_no_files.lance/_transactions/0-b2bbb126-958f-48ef-81f7-b33deac71663.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_versions/1.manifest b/test_data/v0.15.0/v1_no_files.lance/_versions/1.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_no_files.lance/_versions/1.manifest rename to test_data/v0.15.0/v1_no_files.lance/_versions/1.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_latest.manifest b/test_data/v0.15.0/v1_with_files.lance/_latest.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_latest.manifest rename to test_data/v0.15.0/v1_with_files.lance/_latest.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn b/test_data/v0.15.0/v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn rename to test_data/v0.15.0/v1_with_files.lance/_transactions/0-d8a059b3-345b-477b-bcf6-f3ee3692e871.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn b/test_data/v0.15.0/v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn rename to test_data/v0.15.0/v1_with_files.lance/_transactions/0-df497a43-e5ae-4dd1-84da-3fb991ff858a.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_versions/1.manifest b/test_data/v0.15.0/v1_with_files.lance/_versions/1.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/_versions/1.manifest rename to test_data/v0.15.0/v1_with_files.lance/_versions/1.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance b/test_data/v0.15.0/v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance rename to test_data/v0.15.0/v1_with_files.lance/data/320f7c03-94b1-4107-941f-eaab4a75792d.lance diff --git a/python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance b/test_data/v0.15.0/v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance rename to test_data/v0.15.0/v1_with_files.lance/data/7e7057e4-81a6-41b1-a44b-5c1285b1a8f8.lance diff --git a/python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_latest.manifest b/test_data/v0.15.0/v2_no_files.lance/_latest.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_latest.manifest rename to test_data/v0.15.0/v2_no_files.lance/_latest.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn b/test_data/v0.15.0/v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn rename to test_data/v0.15.0/v2_no_files.lance/_transactions/0-131fefa6-74ca-4b2f-92b3-5fe9a5aa7eb4.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn b/test_data/v0.15.0/v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn rename to test_data/v0.15.0/v2_no_files.lance/_transactions/0-ffe96c42-4ab0-4b8a-845b-31faa8b24e4d.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_versions/1.manifest b/test_data/v0.15.0/v2_no_files.lance/_versions/1.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_no_files.lance/_versions/1.manifest rename to test_data/v0.15.0/v2_no_files.lance/_versions/1.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_latest.manifest b/test_data/v0.15.0/v2_with_files.lance/_latest.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_latest.manifest rename to test_data/v0.15.0/v2_with_files.lance/_latest.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn b/test_data/v0.15.0/v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn rename to test_data/v0.15.0/v2_with_files.lance/_transactions/0-053f9cac-5295-4222-aa69-1443c8e9ce9d.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn b/test_data/v0.15.0/v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn rename to test_data/v0.15.0/v2_with_files.lance/_transactions/0-c1bab3c1-6e53-4eed-92c2-0e6496996785.txn diff --git a/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_versions/1.manifest b/test_data/v0.15.0/v2_with_files.lance/_versions/1.manifest similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/_versions/1.manifest rename to test_data/v0.15.0/v2_with_files.lance/_versions/1.manifest diff --git a/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance b/test_data/v0.15.0/v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance rename to test_data/v0.15.0/v2_with_files.lance/data/38639abd-798b-4005-87a1-39d7a151ee30.lance diff --git a/python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance b/test_data/v0.15.0/v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance similarity index 100% rename from python/python/tests/historical_datasets/0_15_0_v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance rename to test_data/v0.15.0/v2_with_files.lance/data/f52fd5c9-0c86-4e22-828c-cd9121ac0217.lance