Skip to content

Commit

Permalink
Merge branch 'main' into docs_artefact_management
Browse files Browse the repository at this point in the history
  • Loading branch information
tanaymeh committed Jun 25, 2024
2 parents 238a396 + f51c5f0 commit 4e76c26
Show file tree
Hide file tree
Showing 74 changed files with 2,905 additions and 2,604 deletions.
105 changes: 54 additions & 51 deletions .github/workflows/java.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
name: Build and Run Java JNI Tests

on:
push:
branches:
Expand All @@ -8,6 +9,7 @@ on:
- java/**
- rust/**
- .github/workflows/java.yml

env:
# This env var is used by Swatinem/rust-cache@v2 for the cache
# key, so we set it to make sure it is always consistent.
Expand All @@ -20,74 +22,75 @@ env:
# CI builds are faster with incremental disabled.
CARGO_INCREMENTAL: "0"
CARGO_BUILD_JOBS: "1"

jobs:
linux-build:
rust-clippy-fmt:
runs-on: ubuntu-22.04
name: ubuntu-22.04 + Java 11 & 17
name: Rust Clippy and Fmt Check
defaults:
run:
working-directory: ./java
working-directory: ./java/core/lance-jni
steps:
- name: Checkout repository
uses: actions/checkout@v4
- uses: Swatinem/rust-cache@v2
with:
workspaces: java/java-jni
workspaces: java/core/lance-jni
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Run cargo fmt
run: cargo fmt --check
working-directory: ./java/core/lance-jni
- name: Rust Clippy
run: cargo clippy --all-targets -- -D warnings

build-and-test-java:
runs-on: ubuntu-22.04
strategy:
matrix:
java-version: [8, 11, 17]
name: Build and Test with Java ${{ matrix.java-version }}
defaults:
run:
working-directory: ./java
steps:
- name: Checkout repository
uses: actions/checkout@v4
- uses: Swatinem/rust-cache@v2
with:
workspaces: java/core/lance-jni
- name: Install dependencies
run: |
sudo apt update
sudo apt install -y protobuf-compiler libssl-dev
- name: Install Java 17
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 17
cache: "maven"
- run: echo "JAVA_17=$JAVA_HOME" >> $GITHUB_ENV
- name: Install Java 8
- name: Set up Java ${{ matrix.java-version }}
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 8
java-version: ${{ matrix.java-version }}
cache: "maven"
- run: echo "JAVA_8=$JAVA_HOME" >> $GITHUB_ENV
- name: Install Java 11
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 11
cache: "maven"
- name: Java Style Check
run: mvn checkstyle:check
- name: Rust Clippy
working-directory: java/core/lance-jni
run: cargo clippy --all-targets -- -D warnings
- name: Running tests with Java 11
run: mvn clean test
- name: Running tests with Java 8
run: JAVA_HOME=$JAVA_8 mvn clean test
- name: Running tests with Java 17
- name: Running tests with Java ${{ matrix.java-version }}
run: |
export JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS \
-XX:+IgnoreUnrecognizedVMOptions \
--add-opens=java.base/java.lang=ALL-UNNAMED \
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED \
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED \
--add-opens=java.base/java.io=ALL-UNNAMED \
--add-opens=java.base/java.net=ALL-UNNAMED \
--add-opens=java.base/java.nio=ALL-UNNAMED \
--add-opens=java.base/java.util=ALL-UNNAMED \
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED \
--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED \
--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens=java.base/sun.nio.cs=ALL-UNNAMED \
--add-opens=java.base/sun.security.action=ALL-UNNAMED \
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED \
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED \
-Djdk.reflect.useDirectMethodHandle=false \
-Dio.netty.tryReflectionSetAccessible=true"
JAVA_HOME=$JAVA_17 mvn clean test
if [ "${{ matrix.java-version }}" == "17" ]; then
export JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS \
-XX:+IgnoreUnrecognizedVMOptions \
--add-opens=java.base/java.lang=ALL-UNNAMED \
--add-opens=java.base/java.lang.invoke=ALL-UNNAMED \
--add-opens=java.base/java.lang.reflect=ALL-UNNAMED \
--add-opens=java.base/java.io=ALL-UNNAMED \
--add-opens=java.base/java.net=ALL-UNNAMED \
--add-opens=java.base/java.nio=ALL-UNNAMED \
--add-opens=java.base/java.util=ALL-UNNAMED \
--add-opens=java.base/java.util.concurrent=ALL-UNNAMED \
--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED \
--add-opens=java.base/jdk.internal.ref=ALL-UNNAMED \
--add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
--add-opens=java.base/sun.nio.cs=ALL-UNNAMED \
--add-opens=java.base/sun.security.action=ALL-UNNAMED \
--add-opens=java.base/sun.util.calendar=ALL-UNNAMED \
--add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED \
-Djdk.reflect.useDirectMethodHandle=false \
-Dio.netty.tryReflectionSetAccessible=true"
fi
mvn clean test
3 changes: 2 additions & 1 deletion .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ jobs:
run: sudo rm -rf target/wheels
linux-arm:
timeout-minutes: 45
runs-on: warp-ubuntu-latest-arm64-4x
#runs-on: warp-ubuntu-latest-arm64-4x
runs-on: buildjet-4vcpu-ubuntu-2204-arm
name: Python Linux 3.${{ matrix.python-minor-version }} ARM
strategy:
matrix:
Expand Down
32 changes: 16 additions & 16 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ exclude = ["python"]
resolver = "2"

[workspace.package]
version = "0.12.4"
version = "0.13.1"
edition = "2021"
authors = ["Lance Devs <dev@lancedb.com>"]
license = "Apache-2.0"
Expand All @@ -40,23 +40,23 @@ categories = [
"development-tools",
"science",
]
rust-version = "1.75"
rust-version = "1.78"

[workspace.dependencies]
lance = { version = "=0.12.4", path = "./rust/lance" }
lance-arrow = { version = "=0.12.4", path = "./rust/lance-arrow" }
lance-core = { version = "=0.12.4", path = "./rust/lance-core" }
lance-datafusion = { version = "=0.12.4", path = "./rust/lance-datafusion" }
lance-datagen = { version = "=0.12.4", path = "./rust/lance-datagen" }
lance-encoding = { version = "=0.12.4", path = "./rust/lance-encoding" }
lance-encoding-datafusion = { version = "=0.12.4", path = "./rust/lance-encoding-datafusion" }
lance-file = { version = "=0.12.4", path = "./rust/lance-file" }
lance-index = { version = "=0.12.4", path = "./rust/lance-index" }
lance-io = { version = "=0.12.4", path = "./rust/lance-io" }
lance-linalg = { version = "=0.12.4", path = "./rust/lance-linalg" }
lance-table = { version = "=0.12.4", path = "./rust/lance-table" }
lance-test-macros = { version = "=0.12.4", path = "./rust/lance-test-macros" }
lance-testing = { version = "=0.12.4", path = "./rust/lance-testing" }
lance = { version = "=0.13.1", path = "./rust/lance" }
lance-arrow = { version = "=0.13.1", path = "./rust/lance-arrow" }
lance-core = { version = "=0.13.1", path = "./rust/lance-core" }
lance-datafusion = { version = "=0.13.1", path = "./rust/lance-datafusion" }
lance-datagen = { version = "=0.13.1", path = "./rust/lance-datagen" }
lance-encoding = { version = "=0.13.1", path = "./rust/lance-encoding" }
lance-encoding-datafusion = { version = "=0.13.1", path = "./rust/lance-encoding-datafusion" }
lance-file = { version = "=0.13.1", path = "./rust/lance-file" }
lance-index = { version = "=0.13.1", path = "./rust/lance-index" }
lance-io = { version = "=0.13.1", path = "./rust/lance-io" }
lance-linalg = { version = "=0.13.1", path = "./rust/lance-linalg" }
lance-table = { version = "=0.13.1", path = "./rust/lance-table" }
lance-test-macros = { version = "=0.13.1", path = "./rust/lance-test-macros" }
lance-testing = { version = "=0.13.1", path = "./rust/lance-testing" }
approx = "0.5.1"
# Note that this one does not include pyarrow
arrow = { version = "51.0.0", optional = false, features = ["prettyprint"] }
Expand Down
5 changes: 5 additions & 0 deletions docs/integrations/pytorch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,8 @@ Available samplers:

- :class:`lance.sampler.ShardedFragmentSampler`
- :class:`lance.sampler.ShardedBatchSampler`

.. warning::
For multiprocessing, you should probably not use fork, as lance is
multi-threaded internally, and fork and multi-threading do not work well together.
Refer to `this discussion <https://discuss.python.org/t/concerns-regarding-deprecation-of-fork-with-alive-threads/33555>`_.
5 changes: 4 additions & 1 deletion docs/integrations/tensorflow.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,7 @@ workers.
for batch in ds:
print(batch)
.. warning::
For multiprocessing, you should probably not use fork, as lance is
multi-threaded internally, and fork and multi-threading do not work well together.
Refer to `this discussion <https://discuss.python.org/t/concerns-regarding-deprecation-of-fork-with-alive-threads/33555>`_.
7 changes: 3 additions & 4 deletions java/core/lance-jni/src/fragment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,6 @@ use crate::{
RT,
};

///////////////////
// Write Methods //
///////////////////

//////////////////
// Read Methods //
//////////////////
Expand Down Expand Up @@ -70,6 +66,9 @@ fn inner_count_rows_native(
Ok(res)
}

///////////////////
// Write Methods //
///////////////////
#[no_mangle]
pub extern "system" fn Java_com_lancedb_lance_Fragment_createWithFfiArray<'local>(
mut env: JNIEnv<'local>,
Expand Down
2 changes: 1 addition & 1 deletion java/core/src/main/java/com/lancedb/lance/Dataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ public long latestVersion() {
/**
* Count the number of rows in the dataset.
*
* @return num of rows.
* @return num of rows
*/
public int countRows() {
try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) {
Expand Down
1 change: 1 addition & 0 deletions protos/encodings.proto
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ message SimpleStruct {}
message Binary {
ArrayEncoding indices = 1;
ArrayEncoding bytes = 2;
uint64 null_adjustment = 3;
}

// Encodings that decode into an Arrow array
Expand Down
7 changes: 3 additions & 4 deletions protos/file2.proto
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,9 @@ import "google/protobuf/empty.proto";
//
// * Each Lance file contains between 0 and 4Gi columns
// * Each column contains between 0 and 4Gi pages
// * Each page contains between 0 and 4Gi items
// * Each page contains between 0 and 2^64 items
// * Different pages within a column can have different item counts
// * Columns may have more than 4Gi items, though this will require more than
// one page
// * Columns may have up to 2^64 items
// * Different columns within a file can have different item counts
//
// The Lance file format does not have any notion of a type system or schemas.
Expand Down Expand Up @@ -178,7 +177,7 @@ message ColumnMetadata {
// may be empty.
repeated uint64 buffer_sizes = 2;
// Logical length (e.g. # rows) of the page
uint32 length = 3;
uint64 length = 3;
// The encoding used to encode the page
Encoding encoding = 4;
}
Expand Down
4 changes: 2 additions & 2 deletions protos/table.proto
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ message DataFile {
// - dimension: packed-struct (0):
// - x: u32 (1)
// - y: u32 (2)
// - path: string (3)
// - path: list<u32> (3)
// - embedding: fsl<768> (4)
// - fp64
// - borders: fsl<4> (5)
Expand All @@ -249,7 +249,7 @@ message DataFile {
// This reflects quite a few phenomena:
// - The packed struct is encoded into a single column and there is no top-level column
// for the x or y fields
// - The string is encoded into two columns
// - The variable sized list is encoded into two columns
// - The embedding is encoded into a single column (common for FSL of primitive) and there
// is no "FSL column"
// - The borders field actually does have an "FSL column"
Expand Down
2 changes: 1 addition & 1 deletion python/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "pylance"
version = "0.12.4"
version = "0.13.1"
edition = "2021"
authors = ["Lance Devs <dev@lancedb.com>"]
rust-version = "1.65"
Expand Down
8 changes: 1 addition & 7 deletions python/python/lance/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .dependencies import _check_for_numpy, _check_for_pandas
from .dependencies import numpy as np
from .dependencies import pandas as pd
from .lance import _build_sq_storage, _Hnsw, _KMeans
from .lance import _Hnsw, _KMeans

if TYPE_CHECKING:
ts_types = Union[datetime, pd.Timestamp, str]
Expand Down Expand Up @@ -245,9 +245,3 @@ def to_lance_file(self, file_path):

def vectors(self) -> pa.Array:
return self._hnsw.vectors()


def build_sq_storage(
row_ids_array: Iterator[pa.Array], vectors_array: pa.Array, dim, bounds: tuple
) -> pa.RecordBatch:
return _build_sq_storage(row_ids_array, vectors_array, dim, bounds)
2 changes: 0 additions & 2 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ pub(crate) mod utils;
pub use crate::arrow::{bfloat16_array, BFloat16};
use crate::fragment::{cleanup_partial_writes, write_fragments};
pub use crate::tracing::{trace_to_chrome, TraceGuard};
use crate::utils::build_sq_storage;
use crate::utils::Hnsw;
use crate::utils::KMeans;
pub use dataset::write_dataset;
Expand Down Expand Up @@ -142,7 +141,6 @@ fn lance(py: Python, m: &PyModule) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(cleanup_partial_writes))?;
m.add_wrapped(wrap_pyfunction!(trace_to_chrome))?;
m.add_wrapped(wrap_pyfunction!(manifest_needs_migration))?;
m.add_wrapped(wrap_pyfunction!(build_sq_storage))?;
// Debug functions
m.add_wrapped(wrap_pyfunction!(debug::format_schema))?;
m.add_wrapped(wrap_pyfunction!(debug::format_manifest))?;
Expand Down
Loading

0 comments on commit 4e76c26

Please sign in to comment.