Merge branch 'branch-23.08' into branch-23.10-merge-23.08
vyasr committed Jul 26, 2023
2 parents 427f879 + 2231b15 commit 5a3f9fc
Showing 8 changed files with 71 additions and 35 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
@@ -79,7 +79,7 @@ jobs:
run_script: "ci/test_notebooks.sh"
wheel-tests-cudf:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yml@branch-23.10
uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
with:
build_type: nightly
branch: ${{ inputs.branch }}
@@ -88,7 +88,7 @@ jobs:
script: ci/test_wheel_cudf.sh
wheel-tests-dask-cudf:
secrets: inherit
uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yml@branch-23.10
uses: rapidsai/shared-action-workflows/.github/workflows/wheels-test.yaml@branch-23.10
with:
matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.10" and (.CUDA_VER == "11.8.0" or .CUDA_VER == "12.0.1")))
build_type: nightly
2 changes: 1 addition & 1 deletion conda/recipes/cudf_kafka/meta.yaml
@@ -16,7 +16,7 @@ source:

build:
number: {{ GIT_DESCRIBE_NUMBER }}
string: py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
script_env:
- AWS_ACCESS_KEY_ID
- AWS_SECRET_ACCESS_KEY
2 changes: 1 addition & 1 deletion conda/recipes/libcudf/meta.yaml
@@ -388,7 +388,7 @@ outputs:
script: install_libcudf_example.sh
build:
number: {{ GIT_DESCRIBE_NUMBER }}
string: {{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
ignore_run_exports_from:
{% if cuda_major == "11" %}
- {{ compiler('cuda11') }}
82 changes: 54 additions & 28 deletions cpp/src/io/parquet/page_decode.cuh
@@ -871,45 +871,65 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s,
level_type lvl)
{
int32_t len;
int level_bits = s->col.level_bits[lvl];
Encoding encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding
: s->page.repetition_level_encoding;
int const level_bits = s->col.level_bits[lvl];
auto const encoding = lvl == level_type::DEFINITION ? s->page.definition_level_encoding
: s->page.repetition_level_encoding;

auto start = cur;
if (level_bits == 0) {
len = 0;
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value
s->initial_rle_value[lvl] = 0;
s->lvl_start[lvl] = cur;
s->abs_lvl_start[lvl] = cur;
} else if (encoding == Encoding::RLE) {
// V2 only uses RLE encoding, so only perform check here
if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) {
len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes;
} else if (cur + 4 < end) {
len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
cur += 4;
} else {
len = 0;
s->error = 2;
}
s->abs_lvl_start[lvl] = cur;
if (!s->error) {
uint32_t run = get_vlq32(cur, end);
s->initial_rle_run[lvl] = run;
if (!(run & 1)) {
int v = (cur < end) ? cur[0] : 0;

auto init_rle = [s, lvl, end, level_bits](uint8_t const* cur, uint8_t const* end) {
uint32_t const run = get_vlq32(cur, end);
s->initial_rle_run[lvl] = run;
if (!(run & 1)) {
if (cur < end) {
int v = cur[0];
cur++;
if (level_bits > 8) {
v |= ((cur < end) ? cur[0] : 0) << 8;
cur++;
}
s->initial_rle_value[lvl] = v;
} else {
s->initial_rle_value[lvl] = 0;
}
s->lvl_start[lvl] = cur;
}
s->lvl_start[lvl] = cur;

if (cur > end) { s->error = 2; }
};

// this is a little redundant. if level_bits == 0, then nothing should be encoded
// for the level, but some V2 files in the wild violate this and encode the data anyway.
// thus we will handle V2 headers separately.
if ((s->page.flags & PAGEINFO_FLAGS_V2) != 0) {
// V2 only uses RLE encoding so no need to check encoding
len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes;
s->abs_lvl_start[lvl] = cur;
if (len == 0) {
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value
s->initial_rle_value[lvl] = 0;
s->lvl_start[lvl] = cur;
} else {
init_rle(cur, cur + len);
}
} else if (level_bits == 0) {
len = 0;
s->initial_rle_run[lvl] = s->page.num_input_values * 2; // repeated value
s->initial_rle_value[lvl] = 0;
s->lvl_start[lvl] = cur;
s->abs_lvl_start[lvl] = cur;
} else if (encoding == Encoding::RLE) { // V1 header with RLE encoding
if (cur + 4 < end) {
len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
cur += 4;
s->abs_lvl_start[lvl] = cur;
init_rle(cur, cur + len);
// add back the 4 bytes for the length
len += 4;
} else {
len = 0;
s->error = 2;
}
} else if (encoding == Encoding::BIT_PACKED) {
len = (s->page.num_input_values * level_bits + 7) >> 3;
s->initial_rle_run[lvl] = ((s->page.num_input_values + 7) >> 3) * 2 + 1; // literal run
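
The hunk above reworks InitLevelSection(): the RLE run-header parsing moves into an init_rle lambda, pages flagged as V2 take the level-section length from def_lvl_bytes/rep_lvl_bytes in the page header, and V1 pages with RLE-encoded levels keep their 4-byte little-endian length prefix in the data stream (added back to len after parsing). The following is a minimal host-side sketch of that V1 layout for orientation only; it is not the cudf device code, and the helper name and return shape are invented for illustration.

import struct

def parse_v1_rle_level_header(buf: bytes, level_bits: int):
    # Illustrative helper, not part of cudf: walk the start of a V1 RLE level section.
    if len(buf) < 4:
        raise ValueError("truncated level section")  # corresponds to s->error = 2
    (section_len,) = struct.unpack_from("<I", buf, 0)  # 4-byte little-endian length prefix
    pos = 4

    # ULEB128 run header, the same role get_vlq32() plays in the kernel
    run, shift = 0, 0
    while True:
        byte = buf[pos]
        pos += 1
        run |= (byte & 0x7F) << shift
        shift += 7
        if not (byte & 0x80):
            break

    # LSB 0 means a repeated-value (RLE) run; the value follows in 1 or 2 bytes
    value = None
    if not (run & 1):
        value = buf[pos]
        pos += 1
        if level_bits > 8:
            value |= buf[pos] << 8
            pos += 1
    return section_len, run, value, pos

A V2 page header carries the level lengths itself and omits the 4-byte prefix, which is why the new code calls init_rle(cur, cur + len) directly when PAGEINFO_FLAGS_V2 is set.
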
@@ -1247,7 +1267,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
s->dict_val = 0;
if ((s->col.data_type & 7) == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
break;
case Encoding::RLE: s->dict_run = 0; break;
case Encoding::RLE: {
// first 4 bytes are length of RLE data
int const len = (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24);
cur += 4;
if (cur + len > end) { s->error = 2; }
s->dict_run = 0;
} break;
default:
s->error = 1; // Unsupported encoding
break;
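
For value data that uses Encoding::RLE (the boolean column exercised by the new test below), the data section likewise begins with a 4-byte length. The change above skips that prefix up front and sets s->error = 2 when the declared length would run past the end of the page. A hedged sketch of the same check, with the helper name invented for illustration:

import struct

def skip_rle_length_prefix(page: bytes) -> int:
    # Illustrative helper, not part of cudf: return the offset of the RLE payload.
    (rle_len,) = struct.unpack_from("<I", page, 0)
    if 4 + rle_len > len(page):
        raise ValueError("RLE data length exceeds page size")  # mirrors s->error = 2
    return 4
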
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/page_hdr.cu
@@ -386,6 +386,7 @@ __global__ void __launch_bounds__(128)
// definition levels
bs->page.chunk_row += bs->page.num_rows;
bs->page.num_rows = 0;
bs->page.flags = 0;
// zero out V2 info
bs->page.num_nulls = 0;
bs->page.def_lvl_bytes = 0;
@@ -395,7 +396,6 @@
case PageType::DATA_PAGE:
index_out = num_dict_pages + data_page_count;
data_page_count++;
bs->page.flags = 0;
// this computation is only valid for flat schemas. for nested schemas,
// they will be recomputed in the preprocess step by examining repetition and
// definition levels
@@ -405,7 +405,7 @@
case PageType::DATA_PAGE_V2:
index_out = num_dict_pages + data_page_count;
data_page_count++;
bs->page.flags = 0;
bs->page.flags |= PAGEINFO_FLAGS_V2;
values_found += bs->page.num_input_values;
// V2 only uses RLE, so it was removed from the header
bs->page.definition_level_encoding = Encoding::RLE;
@@ -414,7 +414,7 @@
case PageType::DICTIONARY_PAGE:
index_out = dictionary_page_count;
dictionary_page_count++;
bs->page.flags = PAGEINFO_FLAGS_DICTIONARY;
bs->page.flags |= PAGEINFO_FLAGS_DICTIONARY;
break;
default: index_out = -1; break;
}
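
In the page-header decoding kernel above, the page flags are now cleared once (bs->page.flags = 0) before the page-type switch, and each case ORs in its own bit, so a DATA_PAGE_V2 page carries PAGEINFO_FLAGS_V2 forward for InitLevelSection to branch on, while dictionary pages still carry PAGEINFO_FLAGS_DICTIONARY. A minimal sketch of that bookkeeping, reusing the bit values from parquet_gpu.hpp; the function name is invented for illustration:

PAGEINFO_FLAGS_DICTIONARY = 1 << 0  # same values as the enum in parquet_gpu.hpp
PAGEINFO_FLAGS_V2 = 1 << 1

def page_flags(page_type: str) -> int:
    # Illustrative helper, not part of cudf.
    flags = 0  # cleared once, before the switch on page type
    if page_type == "DATA_PAGE_V2":
        flags |= PAGEINFO_FLAGS_V2  # later consumed by InitLevelSection
    elif page_type == "DICTIONARY_PAGE":
        flags |= PAGEINFO_FLAGS_DICTIONARY
    return flags
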
1 change: 1 addition & 0 deletions cpp/src/io/parquet/parquet_gpu.hpp
@@ -74,6 +74,7 @@ namespace gpu {
*/
enum {
PAGEINFO_FLAGS_DICTIONARY = (1 << 0), // Indicates a dictionary page
PAGEINFO_FLAGS_V2 = (1 << 1), // V2 page header
};

/**
Binary file not shown.
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
@@ -2518,6 +2518,15 @@ def test_parquet_reader_binary_decimal(datadir):
assert_eq(expect, got)


def test_parquet_reader_rle_boolean(datadir):
fname = datadir / "rle_boolean_encoding.parquet"

expect = pd.read_parquet(fname)
got = cudf.read_parquet(fname)

assert_eq(expect, got)
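
The new test reads a file whose boolean column uses the RLE encoding and checks that cudf matches pandas. As a quick way to confirm what such a file contains, the column-chunk metadata can be inspected; a sketch assuming pyarrow is available and the file is in the working directory:

import pyarrow.parquet as pq

md = pq.ParquetFile("rle_boolean_encoding.parquet").metadata
print(md.row_group(0).column(0).encodings)  # expected to include 'RLE' for the boolean column
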


# testing a specific bug-fix/edge case.
# specifically: in a parquet file containing a particular way of representing
# a list column in a schema, the cudf reader was confusing
