From ff451751c7742dece5b45320518e3b9e70f4a1c9 Mon Sep 17 00:00:00 2001
From: mxwli <maxwli@outlook.com>
Date: Fri, 12 Apr 2024 14:21:56 -0400
Subject: [PATCH 1/5] offset tests

---
 tools/python_api/test/test_df_pyarrow.py | 30 ++++++++++++++++++++++++
 1 file changed, 30 insertions(+)
diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py
index e5ae62600e..cdc485cd5b 100644
--- a/tools/python_api/test/test_df_pyarrow.py
+++ b/tools/python_api/test/test_df_pyarrow.py
@@ -233,6 +233,27 @@ def test_pyarrow_dict(conn_db_readonly : ConnDB) -> None:
         for expected, actual in zip(df[colname], result[colname]):
             assert expected == actual
 
+def test_pyarrow_dict_offset(conn_db_readonly : ConnDB) -> None:
+    random.seed(100)
+    datalength = 4000
+    index = pa.array(range(datalength), type=pa.int64())
+    indices = pa.array([random.randint(0, 2) for _ in range(datalength)])
+    dictionary = pa.array([1, 2, 3, 4])
+    mask = pa.array([random.choice([True, False]) for _ in range(datalength)])
+    col1 = pa.DictionaryArray.from_arrays(indices, dictionary.slice(1, 3), mask=mask)
+    df = pd.DataFrame({
+        'index': arrowtopd(index),
+        'col1': arrowtopd(col1)
+    })
+    result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
+    idx = 0
+    while result.has_next()
+        assert idx < len(index)
+        nxt = result.get_next()
+        proc = [idx, col1[idx].as_py()]
+        assert proc == nxt
+        idx += 1
+
 def test_pyarrow_list(conn_db_readonly : ConnDB) -> None:
     conn, db = conn_db_readonly
     random.seed(100)
@@ -259,6 +280,9 @@ def test_pyarrow_list(conn_db_readonly : ConnDB) -> None:
     
     assert idx == len(index)
 
+def test_pyarrow_list_offset(conn_db_readonly : ConnDB) -> None:
+    pass
+
 def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None:
     conn, db = conn_db_readonly
     random.seed(100)
@@ -287,6 +311,9 @@ def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None:
     
     assert idx == len(index)
 
+def test_pyarrow_struct_offset(conn_db_readonly : ConnDB) -> None:
+    pass
+
 def test_pyarrow_union(conn_db_readonly : ConnDB) -> None:
     pytest.skip("unions are not very well supported by kuzu in general")
     conn, db = conn_db_readonly
@@ -338,3 +365,6 @@ def test_pyarrow_map(conn_db_readonly : ConnDB) -> None:
         idx += 1
 
     assert idx == len(index)
+
+def test_pyarrow_map_offset(conn_db_readonly : ConnDB) -> None:
+    pass

From f49aa68128abf3c85e74e8b727a2480e05dae4cc Mon Sep 17 00:00:00 2001
From: mxwli <maxwli@outlook.com>
Date: Fri, 12 Apr 2024 14:57:19 -0400
Subject: [PATCH 2/5] list offset

---
 src/common/arrow/arrow_array_scan.cpp     |  3 +-
 src/common/arrow/arrow_null_mask_tree.cpp |  4 +--
 tools/python_api/test/test_df_pyarrow.py  | 36 +++++++++++++++++++----
 3 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/src/common/arrow/arrow_array_scan.cpp b/src/common/arrow/arrow_array_scan.cpp
index 9d37d904cd..c3e31e4f2e 100644
--- a/src/common/arrow/arrow_array_scan.cpp
+++ b/src/common/arrow/arrow_array_scan.cpp
@@ -174,7 +174,8 @@ static void scanArrowArrayList(const ArrowSchema* schema, const ArrowArray* arra
     }
     ValueVector* auxiliaryBuffer = ListVector::getDataVector(&outputVector);
     ArrowConverter::fromArrowArray(schema->children[0], array->children[0], *auxiliaryBuffer,
-        mask->getChild(0), offsets[0], auxDstPosition, offsets[count] - offsets[0]);
+        mask->getChild(0), offsets[0] + array->children[0]->offset, auxDstPosition,
+        offsets[count] - offsets[0]);
 }
 
 template<typename offsetsT>
diff --git a/src/common/arrow/arrow_null_mask_tree.cpp b/src/common/arrow/arrow_null_mask_tree.cpp
index e9d7405aa6..4644998746 100644
--- a/src/common/arrow/arrow_null_mask_tree.cpp
+++ b/src/common/arrow/arrow_null_mask_tree.cpp
@@ -56,8 +56,8 @@ void ArrowNullMaskTree::scanListPushDown(const ArrowSchema* schema, const ArrowA
         pushDownMask.setNullFromRange(offsets[i] - offsets[0], offsets[i + 1] - offsets[i],
             isNull(i));
     }
-    children->push_back(ArrowNullMaskTree(schema->children[0], array->children[0], offsets[0],
-        auxiliaryLength, &pushDownMask));
+    children->push_back(ArrowNullMaskTree(schema->children[0], array->children[0],
+        offsets[0] + array->children[0]->offset, auxiliaryLength, &pushDownMask));
 }
 
 void ArrowNullMaskTree::scanStructPushDown(const ArrowSchema* schema, const ArrowArray* array,
diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py
index cdc485cd5b..150f8d6bd7 100644
--- a/tools/python_api/test/test_df_pyarrow.py
+++ b/tools/python_api/test/test_df_pyarrow.py
@@ -234,25 +234,27 @@ def test_pyarrow_dict(conn_db_readonly : ConnDB) -> None:
             assert expected == actual
 
 def test_pyarrow_dict_offset(conn_db_readonly : ConnDB) -> None:
+    conn, db = conn_db_readonly
     random.seed(100)
     datalength = 4000
     index = pa.array(range(datalength), type=pa.int64())
     indices = pa.array([random.randint(0, 2) for _ in range(datalength)])
     dictionary = pa.array([1, 2, 3, 4])
-    mask = pa.array([random.choice([True, False]) for _ in range(datalength)])
-    col1 = pa.DictionaryArray.from_arrays(indices, dictionary.slice(1, 3), mask=mask)
+    col1 = pa.DictionaryArray.from_arrays(indices, dictionary.slice(1, 3))
     df = pd.DataFrame({
         'index': arrowtopd(index),
         'col1': arrowtopd(col1)
     })
     result = conn.execute("LOAD FROM df RETURN * ORDER BY index")
     idx = 0
-    while result.has_next()
+    while result.has_next():
         assert idx < len(index)
         nxt = result.get_next()
         proc = [idx, col1[idx].as_py()]
         assert proc == nxt
         idx += 1
+    
+    assert idx == len(index)
 
 def test_pyarrow_list(conn_db_readonly : ConnDB) -> None:
     conn, db = conn_db_readonly
@@ -281,7 +283,29 @@ def test_pyarrow_list(conn_db_readonly : ConnDB) -> None:
     assert idx == len(index)
 
 def test_pyarrow_list_offset(conn_db_readonly : ConnDB) -> None:
-    pass
+    conn, db = conn_db_readonly
+    random.seed(100)
+    datalength = 50
+    childlength = 5
+    index = pa.array(range(datalength))
+    values = pa.array([generate_primitive('int32[pyarrow]') for _ in range(datalength * childlength + 2)])
+    offsets = pa.array(sorted([random.randint(0, datalength * childlength + 1) for _ in range(datalength + 1)]))
+    mask = pa.array([random.choice([True, False]) for _ in range(datalength)])
+    col1 = pa.ListArray.from_arrays(values=values.slice(2, datalength * childlength), offsets=offsets, mask=mask)
+    df = pd.DataFrame({
+        'index': arrowtopd(index),
+        'col1': arrowtopd(col1),
+    })
+    result = conn.execute('LOAD FROM df RETURN * ORDER BY index')
+    idx = 0
+    while result.has_next():
+        assert idx < len(index)
+        nxt = result.get_next()
+        proc = [idx, col1[idx].as_py()]
+        assert proc == nxt
+        idx += 1
+
+    assert idx == len(index)
 
 def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None:
     conn, db = conn_db_readonly
@@ -312,7 +336,7 @@ def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None:
     assert idx == len(index)
 
 def test_pyarrow_struct_offset(conn_db_readonly : ConnDB) -> None:
-    pass
+    conn, db = conn_db_readonly
 
 def test_pyarrow_union(conn_db_readonly : ConnDB) -> None:
     pytest.skip("unions are not very well supported by kuzu in general")
@@ -367,4 +391,4 @@ def test_pyarrow_map(conn_db_readonly : ConnDB) -> None:
     assert idx == len(index)
 
 def test_pyarrow_map_offset(conn_db_readonly : ConnDB) -> None:
-    pass
+    conn, db = conn_db_readonly

From 1c60dbd99cff89d8e5a167a9de356f41a236d29e Mon Sep 17 00:00:00 2001
From: mxwli <maxwli@outlook.com>
Date: Fri, 12 Apr 2024 15:26:06 -0400
Subject: [PATCH 3/5] apply list scan fixes

---
 src/common/arrow/arrow_array_scan.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/common/arrow/arrow_array_scan.cpp b/src/common/arrow/arrow_array_scan.cpp
index c3e31e4f2e..ea83ee44a2 100644
--- a/src/common/arrow/arrow_array_scan.cpp
+++ b/src/common/arrow/arrow_array_scan.cpp
@@ -164,12 +164,11 @@ static void scanArrowArrayList(const ArrowSchema* schema, const ArrowArray* arra
     uint64_t auxDstPosition = 0;
     for (uint64_t i = 0; i < count; i++) {
         auto curOffset = offsets[i], nextOffset = offsets[i + 1];
-        if (!mask->isNull(i)) {
-            auto newEntry = ListVector::addList(&outputVector, nextOffset - curOffset);
-            outputVector.setValue<list_entry_t>(i + dstOffset, newEntry);
-            if (i == 0) {
-                auxDstPosition = newEntry.offset;
-            }
+        // don't check for validity, since we still need to update the offsets
+        auto newEntry = ListVector::addList(&outputVector, nextOffset - curOffset);
+        outputVector.setValue<list_entry_t>(i + dstOffset, newEntry);
+        if (i == 0) {
+            auxDstPosition = newEntry.offset;
         }
     }
     ValueVector* auxiliaryBuffer = ListVector::getDataVector(&outputVector);

From 67701846042cdad115b98fb5d013c73b88084aa5 Mon Sep 17 00:00:00 2001
From: mxwli <maxwli@outlook.com>
Date: Fri, 12 Apr 2024 16:06:25 -0400
Subject: [PATCH 4/5] add struct fixes

---
 src/common/arrow/arrow_array_scan.cpp     |  4 ++--
 src/common/arrow/arrow_null_mask_tree.cpp |  4 ++--
 tools/python_api/test/test_df_pyarrow.py  | 23 +++++++++++++++++++++++
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/common/arrow/arrow_array_scan.cpp b/src/common/arrow/arrow_array_scan.cpp
index ea83ee44a2..33d8bf51b7 100644
--- a/src/common/arrow/arrow_array_scan.cpp
+++ b/src/common/arrow/arrow_array_scan.cpp
@@ -222,8 +222,8 @@ static void scanArrowArrayStruct(const ArrowSchema* schema, const ArrowArray* ar
     }
     for (int64_t j = 0; j < schema->n_children; j++) {
         ArrowConverter::fromArrowArray(schema->children[j], array->children[j],
-            *StructVector::getFieldVector(&outputVector, j).get(), mask->getChild(j), srcOffset,
-            dstOffset, count);
+            *StructVector::getFieldVector(&outputVector, j).get(), mask->getChild(j),
+            srcOffset + array->children[j]->offset, dstOffset, count);
     }
 }
 
diff --git a/src/common/arrow/arrow_null_mask_tree.cpp b/src/common/arrow/arrow_null_mask_tree.cpp
index 4644998746..a322051bca 100644
--- a/src/common/arrow/arrow_null_mask_tree.cpp
+++ b/src/common/arrow/arrow_null_mask_tree.cpp
@@ -63,8 +63,8 @@ void ArrowNullMaskTree::scanListPushDown(const ArrowSchema* schema, const ArrowA
 void ArrowNullMaskTree::scanStructPushDown(const ArrowSchema* schema, const ArrowArray* array,
     uint64_t srcOffset, uint64_t count) {
     for (int64_t i = 0; i < array->n_children; i++) {
-        children->push_back(ArrowNullMaskTree(schema->children[i], array->children[i], srcOffset,
-            count, mask.get()));
+        children->push_back(ArrowNullMaskTree(schema->children[i], array->children[i],
+            srcOffset + array->children[i]->offset, count, mask.get()));
     }
 }
 
diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py
index 150f8d6bd7..384771fad8 100644
--- a/tools/python_api/test/test_df_pyarrow.py
+++ b/tools/python_api/test/test_df_pyarrow.py
@@ -337,6 +337,29 @@ def test_pyarrow_struct(conn_db_readonly : ConnDB) -> None:
 
 def test_pyarrow_struct_offset(conn_db_readonly : ConnDB) -> None:
     conn, db = conn_db_readonly
+    random.seed(100)
+    datalength = 4096
+    index = pa.array(range(datalength))
+    val1 = pa.array([generate_primitive('int32[pyarrow]') for _ in range(datalength+1)])
+    val2 = pa.array([generate_primitive('bool[pyarrow]') for _ in range(datalength+2)])
+    val3 = pa.array([generate_string(random.randint(5, 10)) for _ in range(datalength+3)])
+    mask = pa.array([random.choice([True, False]) for _ in range(datalength)])
+    col1 = pa.StructArray.from_arrays([val1.slice(1, datalength), val2.slice(2, datalength), val3.slice(3, datalength)], names=['a', 'b', 'c'], mask=mask)
+    df = pd.DataFrame({
+        'index': arrowtopd(index),
+        'col1': arrowtopd(col1)
+    })
+    result = conn.execute('LOAD FROM df RETURN * ORDER BY index')
+    idx = 0
+    while result.has_next():
+        assert idx < len(index)
+        nxt = result.get_next()
+        expected = [idx, col1[idx].as_py()]
+        assert expected == nxt
+        idx += 1
+    
+    assert idx == len(index)
+
 
 def test_pyarrow_union(conn_db_readonly : ConnDB) -> None:
     pytest.skip("unions are not very well supported by kuzu in general")

From 9bd484d7b6131efc7c9ce1c9b360d65e19b694b4 Mon Sep 17 00:00:00 2001
From: mxwli <maxwli@outlook.com>
Date: Fri, 12 Apr 2024 16:27:44 -0400
Subject: [PATCH 5/5] add map tests

---
 tools/python_api/test/test_df_pyarrow.py | 26 ++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tools/python_api/test/test_df_pyarrow.py b/tools/python_api/test/test_df_pyarrow.py
index 384771fad8..95e0d04deb 100644
--- a/tools/python_api/test/test_df_pyarrow.py
+++ b/tools/python_api/test/test_df_pyarrow.py
@@ -415,3 +415,29 @@ def test_pyarrow_map(conn_db_readonly : ConnDB) -> None:
 
 def test_pyarrow_map_offset(conn_db_readonly : ConnDB) -> None:
     conn, db = conn_db_readonly
+    random.seed(100)
+    datalength = 50
+    maplength = 5
+    index = pa.array(range(datalength))
+    offsets = sorted([random.randint(0, datalength * maplength + 1) for _ in range(datalength + 1)])
+    offsets[25] = None
+    offsets = pa.array(offsets, type=pa.int32())
+    keys = pa.array([random.randint(0, (1<<31)-1) for _ in range(datalength * maplength + 1)])
+    values = pa.array([generate_primitive('int64[pyarrow]') for _ in range(datalength * maplength + 1)])
+    _col1 = pa.MapArray.from_arrays(offsets, keys.slice(1, datalength * maplength), values.slice(1, datalength * maplength))
+    col1 = _col1.slice(2, 48)
+    df = pd.DataFrame({
+        'index': arrowtopd(index.slice(0, 48)),
+        'col1': arrowtopd(col1),
+    })
+    result = conn.execute('LOAD FROM df RETURN * ORDER BY index')
+    idx = 0
+    while result.has_next():
+        assert idx < len(index)
+        nxt = result.get_next()
+        expected = [idx, None if col1[idx].as_py() is None else dict(col1[idx].as_py())]
+        assert expected == nxt
+        idx += 1
+    
+    assert idx == 48
+    
\ No newline at end of file