From ff451751c7742dece5b45320518e3b9e70f4a1c9 Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 12 Apr 2024 14:21:56 -0400 Subject: [PATCH 1/5] offset tests --- tools/python_api/test/test_df_pyarrow.py | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py index e5ae62600e..cdc485cd5b 100644 --- a/tools/python_api/test/test_df_pyarrow.py +++ b/tools/python_api/test/test_df_pyarrow.py @@ -233,6 +233,27 @@ def test_pyarrow_dict(conn_db_readonly : ConnDB) -> None: for expected, actual in zip(df[colname], result[colname]): assert expected == actual +def test_pyarrow_dict_offset(conn_db_readonly : ConnDB) -> None: + random.seed(100) + datalength = 4000 + index = pa.array(range(datalength), type=pa.int64()) + indices = pa.array([random.randint(0, 2) for _ in range(datalength)]) + dictionary = pa.array([1, 2, 3, 4]) + mask = pa.array([random.choice([True, False]) for _ in range(datalength)]) + col1 = pa.DictionaryArray.from_arrays(indices, dictionary.slice(1, 3), mask=mask) + df = pd.DataFrame({ + 'index': arrowtopd(index), + 'col1': arrowtopd(col1) + }) + result = conn.execute("LOAD FROM df RETURN * ORDER BY index") + idx = 0 + while result.has_next() + assert idx < len(index) + nxt = result.get_next() + proc = [idx, col1[idx].as_py()] + assert proc == nxt + idx += 1 + def test_pyarrow_list(conn_db_readonly : ConnDB) -> None: conn, db = conn_db_readonly random.seed(100) @@ -259,6 +280,9 @@ def test_pyarrow_list(conn_db_readonly : ConnDB) -> None: assert idx == len(index) +def test_pyarrow_list_offset(conn_db_readonly : ConnDB) -> None: + pass + def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None: conn, db = conn_db_readonly random.seed(100) @@ -287,6 +311,9 @@ def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None: assert idx == len(index) +def test_pyarrow_struct_offset(conn_db_readonly : ConnDB) -> None: + pass + def test_pyarrow_union(conn_db_readonly : ConnDB) -> None: pytest.skip("unions are not very well supported by kuzu in general") conn, db = conn_db_readonly @@ -338,3 +365,6 @@ def test_pyarrow_map(conn_db_readonly : ConnDB) -> None: idx += 1 assert idx == len(index) + +def test_pyarrow_map_offset(conn_db_readonly : ConnDB) -> None: + pass From f49aa68128abf3c85e74e8b727a2480e05dae4cc Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 12 Apr 2024 14:57:19 -0400 Subject: [PATCH 2/5] list offset --- src/common/arrow/arrow_array_scan.cpp | 3 +- src/common/arrow/arrow_null_mask_tree.cpp | 4 +-- tools/python_api/test/test_df_pyarrow.py | 36 +++++++++++++++++++---- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/src/common/arrow/arrow_array_scan.cpp b/src/common/arrow/arrow_array_scan.cpp index 9d37d904cd..c3e31e4f2e 100644 --- a/src/common/arrow/arrow_array_scan.cpp +++ b/src/common/arrow/arrow_array_scan.cpp @@ -174,7 +174,8 @@ static void scanArrowArrayList(const ArrowSchema* schema, const ArrowArray* arra } ValueVector* auxiliaryBuffer = ListVector::getDataVector(&outputVector); ArrowConverter::fromArrowArray(schema->children[0], array->children[0], *auxiliaryBuffer, - mask->getChild(0), offsets[0], auxDstPosition, offsets[count] - offsets[0]); + mask->getChild(0), offsets[0] + array->children[0]->offset, auxDstPosition, + offsets[count] - offsets[0]); } template diff --git a/src/common/arrow/arrow_null_mask_tree.cpp b/src/common/arrow/arrow_null_mask_tree.cpp index e9d7405aa6..4644998746 100644 --- a/src/common/arrow/arrow_null_mask_tree.cpp +++ b/src/common/arrow/arrow_null_mask_tree.cpp @@ -56,8 +56,8 @@ void ArrowNullMaskTree::scanListPushDown(const ArrowSchema* schema, const ArrowA pushDownMask.setNullFromRange(offsets[i] - offsets[0], offsets[i + 1] - offsets[i], isNull(i)); } - children->push_back(ArrowNullMaskTree(schema->children[0], array->children[0], offsets[0], - auxiliaryLength, &pushDownMask)); + children->push_back(ArrowNullMaskTree(schema->children[0], array->children[0], + offsets[0] + array->children[0]->offset, auxiliaryLength, &pushDownMask)); } void ArrowNullMaskTree::scanStructPushDown(const ArrowSchema* schema, const ArrowArray* array, diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py index cdc485cd5b..150f8d6bd7 100644 --- a/tools/python_api/test/test_df_pyarrow.py +++ b/tools/python_api/test/test_df_pyarrow.py @@ -234,25 +234,27 @@ def test_pyarrow_dict(conn_db_readonly : ConnDB) -> None: assert expected == actual def test_pyarrow_dict_offset(conn_db_readonly : ConnDB) -> None: + conn, db = conn_db_readonly random.seed(100) datalength = 4000 index = pa.array(range(datalength), type=pa.int64()) indices = pa.array([random.randint(0, 2) for _ in range(datalength)]) dictionary = pa.array([1, 2, 3, 4]) - mask = pa.array([random.choice([True, False]) for _ in range(datalength)]) - col1 = pa.DictionaryArray.from_arrays(indices, dictionary.slice(1, 3), mask=mask) + col1 = pa.DictionaryArray.from_arrays(indices, dictionary.slice(1, 3)) df = pd.DataFrame({ 'index': arrowtopd(index), 'col1': arrowtopd(col1) }) result = conn.execute("LOAD FROM df RETURN * ORDER BY index") idx = 0 - while result.has_next() + while result.has_next(): assert idx < len(index) nxt = result.get_next() proc = [idx, col1[idx].as_py()] assert proc == nxt idx += 1 + + assert idx == len(index) def test_pyarrow_list(conn_db_readonly : ConnDB) -> None: conn, db = conn_db_readonly @@ -281,7 +283,29 @@ def test_pyarrow_list(conn_db_readonly : ConnDB) -> None: assert idx == len(index) def test_pyarrow_list_offset(conn_db_readonly : ConnDB) -> None: - pass + conn, db = conn_db_readonly + random.seed(100) + datalength = 50 + childlength = 5 + index = pa.array(range(datalength)) + values = pa.array([generate_primitive('int32[pyarrow]') for _ in range(datalength * childlength + 2)]) + offsets = pa.array(sorted([random.randint(0, datalength * childlength + 1) for _ in range(datalength + 1)])) + mask = pa.array([random.choice([True, False]) for _ in range(datalength)]) + col1 = pa.ListArray.from_arrays(values=values.slice(2, datalength * childlength), offsets=offsets, mask=mask) + df = pd.DataFrame({ + 'index': arrowtopd(index), + 'col1': arrowtopd(col1), + }) + result = conn.execute('LOAD FROM df RETURN * ORDER BY index') + idx = 0 + while result.has_next(): + assert idx < len(index) + nxt = result.get_next() + proc = [idx, col1[idx].as_py()] + assert proc == nxt + idx += 1 + + assert idx == len(index) def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None: conn, db = conn_db_readonly @@ -312,7 +336,7 @@ def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None: assert idx == len(index) def test_pyarrow_struct_offset(conn_db_readonly : ConnDB) -> None: - pass + conn, db = conn_db_readonly def test_pyarrow_union(conn_db_readonly : ConnDB) -> None: pytest.skip("unions are not very well supported by kuzu in general") @@ -367,4 +391,4 @@ def test_pyarrow_map(conn_db_readonly : ConnDB) -> None: assert idx == len(index) def test_pyarrow_map_offset(conn_db_readonly : ConnDB) -> None: - pass + conn, db = conn_db_readonly From 1c60dbd99cff89d8e5a167a9de356f41a236d29e Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 12 Apr 2024 15:26:06 -0400 Subject: [PATCH 3/5] apply list scan fixes --- src/common/arrow/arrow_array_scan.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/common/arrow/arrow_array_scan.cpp b/src/common/arrow/arrow_array_scan.cpp index c3e31e4f2e..ea83ee44a2 100644 --- a/src/common/arrow/arrow_array_scan.cpp +++ b/src/common/arrow/arrow_array_scan.cpp @@ -164,12 +164,11 @@ static void scanArrowArrayList(const ArrowSchema* schema, const ArrowArray* arra uint64_t auxDstPosition = 0; for (uint64_t i = 0; i < count; i++) { auto curOffset = offsets[i], nextOffset = offsets[i + 1]; - if (!mask->isNull(i)) { - auto newEntry = ListVector::addList(&outputVector, nextOffset - curOffset); - outputVector.setValue(i + dstOffset, newEntry); - if (i == 0) { - auxDstPosition = newEntry.offset; - } + // don't check for validity, since we still need to update the offsets + auto newEntry = ListVector::addList(&outputVector, nextOffset - curOffset); + outputVector.setValue(i + dstOffset, newEntry); + if (i == 0) { + auxDstPosition = newEntry.offset; } } ValueVector* auxiliaryBuffer = ListVector::getDataVector(&outputVector); From 67701846042cdad115b98fb5d013c73b88084aa5 Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 12 Apr 2024 16:06:25 -0400 Subject: [PATCH 4/5] add struct fixes --- src/common/arrow/arrow_array_scan.cpp | 4 ++-- src/common/arrow/arrow_null_mask_tree.cpp | 4 ++-- tools/python_api/test/test_df_pyarrow.py | 23 +++++++++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/src/common/arrow/arrow_array_scan.cpp b/src/common/arrow/arrow_array_scan.cpp index ea83ee44a2..33d8bf51b7 100644 --- a/src/common/arrow/arrow_array_scan.cpp +++ b/src/common/arrow/arrow_array_scan.cpp @@ -222,8 +222,8 @@ static void scanArrowArrayStruct(const ArrowSchema* schema, const ArrowArray* ar } for (int64_t j = 0; j < schema->n_children; j++) { ArrowConverter::fromArrowArray(schema->children[j], array->children[j], - *StructVector::getFieldVector(&outputVector, j).get(), mask->getChild(j), srcOffset, - dstOffset, count); + *StructVector::getFieldVector(&outputVector, j).get(), mask->getChild(j), + srcOffset + array->children[j]->offset, dstOffset, count); } } diff --git a/src/common/arrow/arrow_null_mask_tree.cpp b/src/common/arrow/arrow_null_mask_tree.cpp index 4644998746..a322051bca 100644 --- a/src/common/arrow/arrow_null_mask_tree.cpp +++ b/src/common/arrow/arrow_null_mask_tree.cpp @@ -63,8 +63,8 @@ void ArrowNullMaskTree::scanListPushDown(const ArrowSchema* schema, const ArrowA void ArrowNullMaskTree::scanStructPushDown(const ArrowSchema* schema, const ArrowArray* array, uint64_t srcOffset, uint64_t count) { for (int64_t i = 0; i < array->n_children; i++) { - children->push_back(ArrowNullMaskTree(schema->children[i], array->children[i], srcOffset, - count, mask.get())); + children->push_back(ArrowNullMaskTree(schema->children[i], array->children[i], + srcOffset + array->children[i]->offset, count, mask.get())); } } diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py index 150f8d6bd7..384771fad8 100644 --- a/tools/python_api/test/test_df_pyarrow.py +++ b/tools/python_api/test/test_df_pyarrow.py @@ -337,6 +337,29 @@ def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None: def test_pyarrow_struct_offset(conn_db_readonly : ConnDB) -> None: conn, db = conn_db_readonly + random.seed(100) + datalength = 4096 + index = pa.array(range(datalength)) + val1 = pa.array([generate_primitive('int32[pyarrow]') for _ in range(datalength+1)]) + val2 = pa.array([generate_primitive('bool[pyarrow]') for _ in range(datalength+2)]) + val3 = pa.array([generate_string(random.randint(5, 10)) for _ in range(datalength+3)]) + mask = pa.array([random.choice([True, False]) for _ in range(datalength)]) + col1 = pa.StructArray.from_arrays([val1.slice(1, datalength), val2.slice(2, datalength), val3.slice(3, datalength)], names=['a', 'b', 'c'], mask=mask) + df = pd.DataFrame({ + 'index': arrowtopd(index), + 'col1': arrowtopd(col1) + }) + result = conn.execute('LOAD FROM df RETURN * ORDER BY index') + idx = 0 + while result.has_next(): + assert idx < len(index) + nxt = result.get_next() + expected = [idx, col1[idx].as_py()] + assert expected == nxt + idx += 1 + + assert idx == len(index) + def test_pyarrow_union(conn_db_readonly : ConnDB) -> None: pytest.skip("unions are not very well supported by kuzu in general") From 9bd484d7b6131efc7c9ce1c9b360d65e19b694b4 Mon Sep 17 00:00:00 2001 From: mxwli Date: Fri, 12 Apr 2024 16:27:44 -0400 Subject: [PATCH 5/5] add map tests --- tools/python_api/test/test_df_pyarrow.py | 26 ++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py index 384771fad8..95e0d04deb 100644 --- a/tools/python_api/test/test_df_pyarrow.py +++ b/tools/python_api/test/test_df_pyarrow.py @@ -415,3 +415,29 @@ def test_pyarrow_map(conn_db_readonly : ConnDB) -> None: def test_pyarrow_map_offset(conn_db_readonly : ConnDB) -> None: conn, db = conn_db_readonly + random.seed(100) + datalength = 50 + maplength = 5 + index = pa.array(range(datalength)) + offsets = sorted([random.randint(0, datalength * maplength + 1) for _ in range(datalength + 1)]) + offsets[25] = None + offsets = pa.array(offsets, type=pa.int32()) + keys = pa.array([random.randint(0, (1<<31)-1) for _ in range(datalength * maplength + 1)]) + values = pa.array([generate_primitive('int64[pyarrow]') for _ in range(datalength * maplength + 1)]) + _col1 = pa.MapArray.from_arrays(offsets, keys.slice(1, datalength * maplength), values.slice(1, datalength * maplength)) + col1 = _col1.slice(2, 48) + df = pd.DataFrame({ + 'index': arrowtopd(index.slice(0, 48)), + 'col1': arrowtopd(col1), + }) + result = conn.execute('LOAD FROM df RETURN * ORDER BY index') + idx = 0 + while result.has_next(): + assert idx < len(index) + nxt = result.get_next() + expected = [idx, None if col1[idx].as_py() is None else dict(col1[idx].as_py())] + assert expected == nxt + idx += 1 + + assert idx == 48 + \ No newline at end of file