Skip to content

Commit

Permalink
remove DataTypeInfo and use LogicalType and column names (#2539)
Browse files Browse the repository at this point in the history
* remove DataTypeInfo and use LogicalType and column names

* fix mistake due to merge

* partial fix

* fix rebase errors

* Run clang-format

* fix array

* Run clang-format

* fix bug...

* add requested changes

* clang-tidy

---------

Co-authored-by: Russell Liu <russell.liu@uwaterloo.ca>
Co-authored-by: mxwli <maxwli@outlook.com>
Co-authored-by: CI Bot <mxwli@users.noreply.github.com>
  • Loading branch information
4 people committed Apr 9, 2024
1 parent 966809e commit 0b70b02
Show file tree
Hide file tree
Showing 11 changed files with 295 additions and 333 deletions.
4 changes: 2 additions & 2 deletions src/common/arrow/arrow_array_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ static void scanArrowArrayFixedList(const ArrowSchema* schema, const ArrowArray*
ValueVector& outputVector, ArrowNullMaskTree* mask, uint64_t srcOffset, uint64_t dstOffset,
uint64_t count) {
mask->copyToValueVector(&outputVector, dstOffset, count);
int64_t numValuesInList = FixedListType::getNumValuesInList(&outputVector.dataType);
int64_t numValuesInList = ArrayType::getNumValuesInList(&outputVector.dataType);
ArrowConverter::fromArrowArray(schema->children[0], array->children[0], outputVector,
mask->getChild(0), srcOffset * numValuesInList, dstOffset * numValuesInList,
count * numValuesInList);
Expand Down Expand Up @@ -499,7 +499,7 @@ void ArrowConverter::fromArrowArray(const ArrowSchema* schema, const ArrowArray*
return scanArrowArrayList<int64_t>(schema, array, outputVector, mask, srcOffset,
dstOffset, count);
case 'w':
// FIXED_LIST
// ARRAY
// TODO Manh: Array Scanning
KU_UNREACHABLE;
// return scanArrowArrayFixedList(
Expand Down
66 changes: 46 additions & 20 deletions src/common/arrow/arrow_converter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,10 @@ void ArrowConverter::initializeChild(ArrowSchema& child, const std::string& name
}

void ArrowConverter::setArrowFormatForStruct(ArrowSchemaHolder& rootHolder, ArrowSchema& child,
const main::DataTypeInfo& typeInfo) {
auto& childrenTypesInfo = typeInfo.childrenTypesInfo;
const LogicalType& dataType) {
child.format = "+s";
// name is set by parent.
child.n_children = (std::int64_t)childrenTypesInfo.size();
child.n_children = (std::int64_t)StructType::getNumFields(&dataType);
rootHolder.nestedChildren.emplace_back();
rootHolder.nestedChildren.back().resize(child.n_children);
rootHolder.nestedChildrenPtr.emplace_back();
Expand All @@ -57,14 +56,36 @@ void ArrowConverter::setArrowFormatForStruct(ArrowSchemaHolder& rootHolder, Arro
child.children = &rootHolder.nestedChildrenPtr.back()[0];
for (auto i = 0u; i < child.n_children; i++) {
initializeChild(*child.children[i]);
child.children[i]->name = copyName(rootHolder, childrenTypesInfo[i]->name);
setArrowFormat(rootHolder, *child.children[i], *childrenTypesInfo[i]);
auto structField = StructType::getField(&dataType, i);
child.children[i]->name = copyName(rootHolder, structField->getName());
setArrowFormat(rootHolder, *child.children[i], *structField->getType());
}
}

void ArrowConverter::setArrowFormatForInternalID(ArrowSchemaHolder& rootHolder, ArrowSchema& child,
const LogicalType& /*dataType*/) {
child.format = "+s";
// name is set by parent.
child.n_children = 2;
rootHolder.nestedChildren.emplace_back();
rootHolder.nestedChildren.back().resize(child.n_children);
rootHolder.nestedChildrenPtr.emplace_back();
rootHolder.nestedChildrenPtr.back().resize(child.n_children);
for (auto i = 0u; i < child.n_children; i++) {
rootHolder.nestedChildrenPtr.back()[i] = &rootHolder.nestedChildren.back()[i];
}
child.children = &rootHolder.nestedChildrenPtr.back()[0];
initializeChild(*child.children[0]);
child.children[0]->name = copyName(rootHolder, "table");
setArrowFormat(rootHolder, *child.children[0], *LogicalType::INT64());
initializeChild(*child.children[1]);
child.children[1]->name = copyName(rootHolder, "offset");
setArrowFormat(rootHolder, *child.children[1], *LogicalType::INT64());
}

void ArrowConverter::setArrowFormat(ArrowSchemaHolder& rootHolder, ArrowSchema& child,
const main::DataTypeInfo& typeInfo) {
switch (typeInfo.typeID) {
const LogicalType& dataType) {
switch (dataType.getLogicalTypeID()) {
case LogicalTypeID::BOOL: {
child.format = "b";
} break;
Expand Down Expand Up @@ -137,10 +158,10 @@ void ArrowConverter::setArrowFormat(ArrowSchemaHolder& rootHolder, ArrowSchema&
initializeChild(rootHolder.nestedChildren.back()[0]);
child.children = &rootHolder.nestedChildrenPtr.back()[0];
child.children[0]->name = "l";
setArrowFormat(rootHolder, **child.children, *typeInfo.childrenTypesInfo[0]);
setArrowFormat(rootHolder, **child.children, *ListType::getChildType(&dataType));
} break;
case LogicalTypeID::ARRAY: {
auto numValuesPerArray = "+w:" + std::to_string(typeInfo.fixedNumValues);
auto numValuesPerArray = "+w:" + std::to_string(ArrayType::getNumElements(&dataType));
child.format = copyName(rootHolder, numValuesPerArray);
child.n_children = 1;
rootHolder.nestedChildren.emplace_back();
Expand All @@ -150,25 +171,27 @@ void ArrowConverter::setArrowFormat(ArrowSchemaHolder& rootHolder, ArrowSchema&
initializeChild(rootHolder.nestedChildren.back()[0]);
child.children = &rootHolder.nestedChildrenPtr.back()[0];
child.children[0]->name = "l";
setArrowFormat(rootHolder, **child.children, *typeInfo.childrenTypesInfo[0]);
setArrowFormat(rootHolder, **child.children, *ArrayType::getChildType(&dataType));
} break;
case LogicalTypeID::STRUCT:
case LogicalTypeID::INTERNAL_ID:
case LogicalTypeID::NODE:
case LogicalTypeID::REL: {
setArrowFormatForStruct(rootHolder, child, typeInfo);
} break;
case LogicalTypeID::REL:
setArrowFormatForStruct(rootHolder, child, dataType);
break;
case LogicalTypeID::INTERNAL_ID:
setArrowFormatForInternalID(rootHolder, child, dataType);
break;
default:
KU_UNREACHABLE;
}
}

std::unique_ptr<ArrowSchema> ArrowConverter::toArrowSchema(
const std::vector<std::unique_ptr<main::DataTypeInfo>>& typesInfo) {
const std::vector<LogicalType>& dataTypes, const std::vector<std::string>& columnNames) {
auto outSchema = std::make_unique<ArrowSchema>();
auto rootHolder = std::make_unique<ArrowSchemaHolder>();

auto columnCount = (int64_t)typesInfo.size();
auto columnCount = (int64_t)dataTypes.size();
rootHolder->children.resize(columnCount);
rootHolder->childrenPtrs.resize(columnCount);
for (auto i = 0u; i < columnCount; i++) {
Expand All @@ -186,8 +209,8 @@ std::unique_ptr<ArrowSchema> ArrowConverter::toArrowSchema(
for (auto i = 0u; i < columnCount; i++) {
auto& child = rootHolder->children[i];
initializeChild(child);
child.name = copyName(*rootHolder, typesInfo[i]->name);
setArrowFormat(*rootHolder, child, *typesInfo[i]);
child.name = copyName(*rootHolder, columnNames[i]);
setArrowFormat(*rootHolder, child, dataTypes[i]);
}

outSchema->private_data = rootHolder.release();
Expand All @@ -197,8 +220,11 @@ std::unique_ptr<ArrowSchema> ArrowConverter::toArrowSchema(

void ArrowConverter::toArrowArray(main::QueryResult& queryResult, ArrowArray* outArray,
std::int64_t chunkSize) {
auto typesInfo = queryResult.getColumnTypesInfo();
auto rowBatch = make_unique<ArrowRowBatch>(std::move(typesInfo), chunkSize);
std::vector<LogicalType> types;
for (auto type : queryResult.getColumnDataTypes()) {
types.push_back(type);
}
auto rowBatch = make_unique<ArrowRowBatch>(std::move(types), chunkSize);
*outArray = rowBatch->append(queryResult, chunkSize);
}

Expand Down
Loading

0 comments on commit 0b70b02

Please sign in to comment.