Skip to content

Commit

Permalink
Add Pyarrow Map Scanning (#3158)
Browse files Browse the repository at this point in the history
* add map scanning

* clang format and python lint

* changed decimal error message
  • Loading branch information
mxwli committed Mar 28, 2024
1 parent 2ec13b2 commit 08fd180
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 8 deletions.
5 changes: 2 additions & 3 deletions src/common/arrow/arrow_array_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,9 +510,8 @@ void ArrowConverter::fromArrowArray(const ArrowSchema* schema, const ArrowArray*
schema, array, outputVector, mask, srcOffset, dstOffset, count);
case 'm':
// MAP
// TODO maxwell;
// for some reason the columnar format specification doesn't mention maps at all
KU_UNREACHABLE;
return scanArrowArrayVarList<int32_t>(
schema, array, outputVector, mask, srcOffset, dstOffset, count);
case 'u':
if (arrowType[2] == 'd') {
// DENSE UNION
Expand Down
6 changes: 4 additions & 2 deletions src/common/arrow/arrow_null_mask_tree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,10 @@ ArrowNullMaskTree::ArrowNullMaskTree(const ArrowSchema* schema, const ArrowArray
scanStructPushDown(schema, array, srcOffset, count);
break;
case 'm':
// TODO maxwell bind map types
KU_UNREACHABLE;
copyFromBuffer(array->buffers[0], srcOffset, count);
applyParentBitmap(parentBitmap, count);
scanListPushDown<int32_t>(schema, array, srcOffset, count);
break;
case 'u': {
const int8_t* types = (const int8_t*)array->buffers[0];
if (schema->format[2] == 'd') {
Expand Down
7 changes: 4 additions & 3 deletions src/common/arrow/arrow_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ LogicalType ArrowConverter::fromArrowSchema(const ArrowSchema* schema) {
}

case 'd':
throw NotImplementedException("custom bitwidth decimals are not supported");
throw NotImplementedException("Decimals are not supported");
case 'w':
return LogicalType(LogicalTypeID::BLOB); // fixed width binary
case 't':
Expand Down Expand Up @@ -115,8 +115,9 @@ LogicalType ArrowConverter::fromArrowSchema(const ArrowSchema* schema) {
}
return *LogicalType::STRUCT(std::move(structFields));
case 'm':
// TODO maxwell bind map types
throw NotImplementedException("Scanning Arrow Map types is not supported");
return *LogicalType::MAP(
std::make_unique<LogicalType>(fromArrowSchema(schema->children[0]->children[0])),
std::make_unique<LogicalType>(fromArrowSchema(schema->children[0]->children[1])));
case 'u':
throw RuntimeException("Unions are currently WIP.");
for (int64_t i = 0; i < schema->n_children; i++) {
Expand Down
25 changes: 25 additions & 0 deletions tools/python_api/test/test_df_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,3 +313,28 @@ def test_pyarrow_union(conn_db_readonly : ConnDB) -> None:

assert idx == len(index)

def test_pyarrow_map(conn_db_readonly : ConnDB) -> None:
conn, db = conn_db_readonly
random.seed(100)
datalength = 4096
index = pa.array(range(datalength))
keySet = range(100)
valueSet = 'abcdefghijklmnopqrstuvwxyz'
col1 = pa.array([
{str(key) : ''.join(random.sample(valueSet, random.randint(0, len(valueSet))))
for key in random.sample(keySet, random.randint(1, len(keySet)))}
if random.randint(0, 5) != 0 else None for i in range(datalength)],
type=pa.map_(pa.string(), pa.string()))
df = pd.DataFrame({
'index': arrowtopd(index),
'col1': arrowtopd(col1)})
result = conn.execute('CALL READ_PANDAS(df) RETURN * ORDER BY index')
idx = 0
while result.has_next():
assert idx < len(index)
nxt = result.get_next()
expected = [idx, None if col1[idx].as_py() is None else dict(col1[idx].as_py())]
assert expected == nxt
idx += 1

assert idx == len(index)

0 comments on commit 08fd180

Please sign in to comment.