Skip to content

Commit

Permalink
fix pyarrow segfaulting on fedora 39 (#3213)
Browse files — browse the repository at this point in the history
  • Loading branch information
mxwli committed Apr 5, 2024
1 parent 8923c7f commit 33111c8
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 15 deletions.
10 changes: 5 additions & 5 deletions tools/python_api/src_cpp/include/pyarrow/pyarrow_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,30 @@ struct PyArrowTableScanSharedState final : public function::BaseScanSharedState
std::mutex lock;

PyArrowTableScanSharedState(
uint64_t numRows, std::vector<std::shared_ptr<ArrowArrayWrapper>>&& chunks)
uint64_t numRows, std::vector<std::shared_ptr<ArrowArrayWrapper>> chunks)
: BaseScanSharedState{numRows}, chunks{std::move(chunks)}, currentChunk{0} {}

ArrowArrayWrapper* getNextChunk();
};

// Bind data for the PyArrow table scan: carries the Arrow schema, the row count and the
// source of the data to scan.
// NOTE(review): this listing is a diff rendered without +/- markers, so several lines below
// appear in two spellings back to back (apparently the removed line followed by the added
// line). It is not compilable as shown — confirm against the raw diff.
struct PyArrowTableScanFunctionData final : public function::TableFuncBindData {
// Shared handle to the Arrow schema; copy() shares it rather than cloning it.
std::shared_ptr<ArrowSchemaWrapper> schema;
// NOTE(review): `table` and `arrowArrayBatches` look like the removed/added pair of this
// change (a Python object vs. shared Arrow array wrappers) — confirm.
py::object table;
std::vector<std::shared_ptr<ArrowArrayWrapper>> arrowArrayBatches;
// Row count supplied by the caller at construction time.
uint64_t numRows;

// Constructs the bind data. columnTypes and columnNames are moved into the TableFuncBindData
// base; schema is moved into the member of the same name.
PyArrowTableScanFunctionData(std::vector<common::LogicalType> columnTypes,
std::shared_ptr<ArrowSchemaWrapper> schema, std::vector<std::string> columnNames,
// Two spellings of the last parameters: a py::handle table vs. a vector of Arrow batches.
py::handle table, uint64_t numRows)
std::vector<std::shared_ptr<ArrowArrayWrapper>> arrowArrayBatches, uint64_t numRows)
: TableFuncBindData{std::move(columnTypes), std::move(columnNames)},
// Two spellings of the member initializers: the first borrows a reference to the Python
// table; the second copy-constructs the vector of shared_ptrs from the by-value parameter.
// NOTE(review): the parameter is taken by value but not moved; std::move(arrowArrayBatches)
// would avoid the extra vector copy — confirm and consider.
schema{std::move(schema)}, table{py::reinterpret_borrow<py::object>(table)}, numRows{numRows} {}
schema{std::move(schema)}, arrowArrayBatches{arrowArrayBatches}, numRows{numRows} {}

// Empty destructor; declared explicitly only to mark it as an override.
~PyArrowTableScanFunctionData() override {}

// Returns a new PyArrowTableScanFunctionData built from this object's members.
std::unique_ptr<function::TableFuncBindData> copy() const override {
// Hold the Python GIL for the duration of the copy.
py::gil_scoped_acquire acquire;
// the schema is considered immutable so copying it by copying the shared_ptr is fine.
return std::make_unique<PyArrowTableScanFunctionData>(
// Two spellings of the argument list: passing *table vs. passing arrowArrayBatches.
columnTypes, schema, columnNames, *table, numRows);
columnTypes, schema, columnNames, arrowArrayBatches, numRows);
}
};

Expand Down
2 changes: 1 addition & 1 deletion tools/python_api/src_cpp/pandas/pandas_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,9 @@ static std::unique_ptr<ScanReplacementData> tryReplacePD(py::dict& dict, py::str
}

std::unique_ptr<ScanReplacementData> replacePD(const std::string& objectName) {
py::gil_scoped_acquire acquire;
auto pyTableName = py::str(objectName);
// Here we do an exhaustive search on the frame lineage.
py::gil_scoped_acquire acquire;
auto currentFrame = importCache->inspect.currentframe()();
while (hasattr(currentFrame, "f_locals")) {
auto localDict = py::reinterpret_borrow<py::dict>(currentFrame.attr("f_locals"));
Expand Down
19 changes: 10 additions & 9 deletions tools/python_api/src_cpp/pyarrow/pyarrow_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,16 @@ static std::unique_ptr<function::TableFuncBindData> bindFunc(
}
auto numRows = py::len(table);
auto schema = Pyarrow::bind(table, returnTypes, names);

py::list batches = table.attr("to_batches")(DEFAULT_VECTOR_CAPACITY);
std::vector<std::shared_ptr<ArrowArrayWrapper>> arrowArrayBatches;
for (auto& i: batches) {
arrowArrayBatches.push_back(std::make_shared<ArrowArrayWrapper>());
i.attr("_export_to_c")(reinterpret_cast<uint64_t>(arrowArrayBatches.back().get()));
}

return std::make_unique<PyArrowTableScanFunctionData>(
std::move(returnTypes), std::move(schema), std::move(names), std::move(table), numRows);
std::move(returnTypes), std::move(schema), std::move(names), arrowArrayBatches, numRows);
}

ArrowArrayWrapper* PyArrowTableScanSharedState::getNextChunk() {
Expand All @@ -46,16 +54,9 @@ static std::unique_ptr<function::TableFuncSharedState> initSharedState(
py::gil_scoped_acquire acquire;
PyArrowTableScanFunctionData* bindData =
dynamic_cast<PyArrowTableScanFunctionData*>(input.bindData);
py::list batches = bindData->table.attr("to_batches")(DEFAULT_VECTOR_CAPACITY);
std::vector<std::shared_ptr<ArrowArrayWrapper>> arrowArrayBatches;

for (auto& i : batches) {
arrowArrayBatches.push_back(std::make_shared<ArrowArrayWrapper>());
i.attr("_export_to_c")(reinterpret_cast<uint64_t>(arrowArrayBatches.back().get()));
}

return std::make_unique<PyArrowTableScanSharedState>(
bindData->numRows, std::move(arrowArrayBatches));
bindData->numRows, bindData->arrowArrayBatches);
}

static std::unique_ptr<function::TableFuncLocalState> initLocalState(
Expand Down

0 comments on commit 33111c8

Please sign in to comment.