From 0eefb070062bae397d74952bda2942749cdf3fc7 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Wed, 20 Mar 2024 21:20:33 +0800 Subject: [PATCH] GH-36026: [Python] Fix ORC test segfault in the python wheel windows test (#40609) ### Rationale for this change The PyArrow ORC reader always crashes when it tries to create its internal ORC reader. The crash is caused by a failure to read the tz database on the local host. This crash is also why ARROW_ORC has been turned off in the Windows wheel build. ### What changes are included in this PR? Download the IANA timezone database on the test host and explicitly set TZDIR so that the CI jobs pass. ### Are these changes tested? Make sure all Python wheel Windows CI jobs pass. ### Are there any user-facing changes? No. * GitHub Issue: #36026 Authored-by: Gang Wu Signed-off-by: Sutou Kouhei --- .env | 2 +- .github/workflows/java_jni.yml | 2 +- appveyor.yml | 1 + ci/appveyor-cpp-build.bat | 3 ++- ci/docker/python-wheel-windows-test-vs2019.dockerfile | 3 +++ ci/docker/python-wheel-windows-vs2019.dockerfile | 1 + ci/scripts/python_wheel_windows_build.bat | 2 +- ci/scripts/python_wheel_windows_test.bat | 9 ++++++++- cpp/src/arrow/adapters/orc/adapter.cc | 2 +- 9 files changed, 19 insertions(+), 6 deletions(-) diff --git a/.env b/.env index 877bd50864d38..9e58f68bfb98c 100644 --- a/.env +++ b/.env @@ -98,7 +98,7 @@ VCPKG="a42af01b72c28a8e1d7b48107b33e4f286a55ef6" # 2023.11.20 Release # ci/docker/python-wheel-windows-vs2019.dockerfile. # This is a workaround for our CI problem that "archery docker build" doesn't # use pulled built images in dev/tasks/python-wheels/github.windows.yml. -PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-03-12 +PYTHON_WHEEL_WINDOWS_IMAGE_REVISION=2024-03-19 # Use conanio/${CONAN} for "docker-compose run --rm conan". 
See # https://github.com/conan-io/conan-docker-tools#readme for available diff --git a/.github/workflows/java_jni.yml b/.github/workflows/java_jni.yml index 45de57f360a42..3fa88fa4ae40e 100644 --- a/.github/workflows/java_jni.yml +++ b/.github/workflows/java_jni.yml @@ -52,7 +52,7 @@ jobs: name: AMD64 manylinux2014 Java JNI runs-on: ubuntu-latest if: ${{ !contains(github.event.pull_request.title, 'WIP') }} - timeout-minutes: 500 + timeout-minutes: 90 steps: - name: Checkout Arrow uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0 diff --git a/appveyor.yml b/appveyor.yml index d0f9082833cc9..5954251d34733 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -44,6 +44,7 @@ environment: ARROW_BUILD_FLIGHT_SQL: "ON" ARROW_BUILD_GANDIVA: "ON" ARROW_GCS: "ON" + ARROW_ORC: "ON" ARROW_S3: "ON" GENERATOR: Ninja PYTHON: "3.10" diff --git a/ci/appveyor-cpp-build.bat b/ci/appveyor-cpp-build.bat index ab85032fe9924..8cfa67c437264 100644 --- a/ci/appveyor-cpp-build.bat +++ b/ci/appveyor-cpp-build.bat @@ -80,7 +80,7 @@ cmake -G "%GENERATOR%" %ARROW_CMAKE_ARGS% ^ -DARROW_HDFS=ON ^ -DARROW_JSON=ON ^ -DARROW_MIMALLOC=ON ^ - -DARROW_ORC=ON ^ + -DARROW_ORC=%ARROW_ORC% ^ -DARROW_PARQUET=ON ^ -DARROW_S3=%ARROW_S3% ^ -DARROW_SUBSTRAIT=ON ^ @@ -125,6 +125,7 @@ set PYARROW_WITH_DATASET=ON set PYARROW_WITH_FLIGHT=%ARROW_BUILD_FLIGHT% set PYARROW_WITH_GANDIVA=%ARROW_BUILD_GANDIVA% set PYARROW_WITH_GCS=%ARROW_GCS% +set PYARROW_WITH_ORC=%ARROW_ORC% set PYARROW_WITH_PARQUET=ON set PYARROW_WITH_PARQUET_ENCRYPTION=ON set PYARROW_WITH_S3=%ARROW_S3% diff --git a/ci/docker/python-wheel-windows-test-vs2019.dockerfile b/ci/docker/python-wheel-windows-test-vs2019.dockerfile index 67d99fa9c5724..819324a74e12a 100644 --- a/ci/docker/python-wheel-windows-test-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-test-vs2019.dockerfile @@ -42,3 +42,6 @@ RUN (if "%python%"=="3.8" setx PYTHON_VERSION "3.8.10" && setx PATH "%PATH%;C:\P (if "%python%"=="3.12" setx PYTHON_VERSION "3.12.0" && 
setx PATH "%PATH%;C:\Python312;C:\Python312\Scripts") RUN choco install -r -y --no-progress python --version=%PYTHON_VERSION% RUN python -m pip install -U pip setuptools + +# Install archiver to extract xz archives +RUN choco install --no-progress -r -y archiver diff --git a/ci/docker/python-wheel-windows-vs2019.dockerfile b/ci/docker/python-wheel-windows-vs2019.dockerfile index b8e8aad952b1c..0ab5071abb86c 100644 --- a/ci/docker/python-wheel-windows-vs2019.dockerfile +++ b/ci/docker/python-wheel-windows-vs2019.dockerfile @@ -66,6 +66,7 @@ RUN vcpkg install \ --x-feature=flight \ --x-feature=gcs \ --x-feature=json \ + --x-feature=orc \ --x-feature=parquet \ --x-feature=s3 diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 73b0192d9bc97..54f02ec6f6ed0 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -36,7 +36,7 @@ set ARROW_FLIGHT=ON set ARROW_GANDIVA=OFF set ARROW_GCS=ON set ARROW_HDFS=ON -set ARROW_ORC=OFF +set ARROW_ORC=ON set ARROW_PARQUET=ON set PARQUET_REQUIRE_ENCRYPTION=ON set ARROW_MIMALLOC=ON diff --git a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index b14bfddfb36d3..a928c3571d0cb 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -24,7 +24,7 @@ set PYARROW_TEST_FLIGHT=ON set PYARROW_TEST_GANDIVA=OFF set PYARROW_TEST_GCS=ON set PYARROW_TEST_HDFS=ON -set PYARROW_TEST_ORC=OFF +set PYARROW_TEST_ORC=ON set PYARROW_TEST_PARQUET=ON set PYARROW_TEST_PARQUET_ENCRYPTION=ON set PYARROW_TEST_SUBSTRAIT=ON @@ -56,8 +56,15 @@ python -c "import pyarrow.dataset" || exit /B 1 python -c "import pyarrow.flight" || exit /B 1 python -c "import pyarrow.fs" || exit /B 1 python -c "import pyarrow.json" || exit /B 1 +python -c "import pyarrow.orc" || exit /B 1 python -c "import pyarrow.parquet" || exit /B 1 python -c "import pyarrow.substrait" || exit /B 1 +@rem Download IANA 
Timezone Database for ORC C++ +curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B +mkdir %USERPROFILE%\Downloads\test\tzdata +arc unarchive tzdata.tar.xz %USERPROFILE%\Downloads\test\tzdata +set TZDIR=%USERPROFILE%\Downloads\test\tzdata\usr\share\zoneinfo + @REM Execute unittest pytest -r s --pyargs pyarrow || exit /B 1 diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc index 31fba1a9bd50e..2100e701f3302 100644 --- a/cpp/src/arrow/adapters/orc/adapter.cc +++ b/cpp/src/arrow/adapters/orc/adapter.cc @@ -491,7 +491,7 @@ class ORCFileReader::Impl { if (!include_indices.empty()) { RETURN_NOT_OK(SelectIndices(&opts, include_indices)); } - StripeInformation stripe_info({0, 0, 0, 0}); + StripeInformation stripe_info{0, 0, 0, 0}; RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info)); ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); std::unique_ptr row_reader;