From 1653b4026cec55a49747a17c8cb3c57ec6b0ed3c Mon Sep 17 00:00:00 2001 From: ziyi chen Date: Mon, 26 Feb 2024 09:28:55 -0500 Subject: [PATCH] Revert "Implement Python Import Caching" --- tools/python_api/CMakeLists.txt | 2 - .../cached_import/py_cached_import.cpp | 18 --- .../src_cpp/cached_import/py_cached_item.cpp | 24 ---- .../include/cached_import/py_cached_import.h | 33 ----- .../include/cached_import/py_cached_item.h | 26 ---- .../include/cached_import/py_cached_modules.h | 128 ------------------ .../python_api/src_cpp/include/py_database.h | 2 +- .../src_cpp/pandas/pandas_analyzer.cpp | 7 +- .../python_api/src_cpp/pandas/pandas_scan.cpp | 4 +- tools/python_api/src_cpp/py_connection.cpp | 15 +- tools/python_api/src_cpp/py_conversion.cpp | 12 +- tools/python_api/src_cpp/py_database.cpp | 7 - tools/python_api/src_cpp/py_query_result.cpp | 29 ++-- .../src_cpp/py_query_result_converter.cpp | 9 +- 14 files changed, 37 insertions(+), 279 deletions(-) delete mode 100644 tools/python_api/src_cpp/cached_import/py_cached_import.cpp delete mode 100644 tools/python_api/src_cpp/cached_import/py_cached_item.cpp delete mode 100644 tools/python_api/src_cpp/include/cached_import/py_cached_import.h delete mode 100644 tools/python_api/src_cpp/include/cached_import/py_cached_item.h delete mode 100644 tools/python_api/src_cpp/include/cached_import/py_cached_modules.h diff --git a/tools/python_api/CMakeLists.txt b/tools/python_api/CMakeLists.txt index 1319a8c43c..e1c3e1f4d9 100644 --- a/tools/python_api/CMakeLists.txt +++ b/tools/python_api/CMakeLists.txt @@ -9,8 +9,6 @@ file(GLOB SOURCE_PY pybind11_add_module(_kuzu SHARED src_cpp/kuzu_binding.cpp - src_cpp/cached_import/py_cached_item.cpp - src_cpp/cached_import/py_cached_import.cpp src_cpp/py_connection.cpp src_cpp/py_database.cpp src_cpp/py_prepared_statement.cpp diff --git a/tools/python_api/src_cpp/cached_import/py_cached_import.cpp b/tools/python_api/src_cpp/cached_import/py_cached_import.cpp deleted file mode 100644 index e0317ab9fd..0000000000 --- a/tools/python_api/src_cpp/cached_import/py_cached_import.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include "cached_import/py_cached_import.h" - -namespace kuzu { - -PythonCachedImport::~PythonCachedImport() { - py::gil_scoped_acquire acquire; - allObjects.clear(); -} - -py::handle PythonCachedImport::addToCache(py::object obj) { - auto ptr = obj.ptr(); - allObjects.push_back(obj); - return ptr; -} - -std::shared_ptr importCache; - -} // namespace kuzu diff --git a/tools/python_api/src_cpp/cached_import/py_cached_item.cpp b/tools/python_api/src_cpp/cached_import/py_cached_item.cpp deleted file mode 100644 index ebbc1ecb39..0000000000 --- a/tools/python_api/src_cpp/cached_import/py_cached_item.cpp +++ /dev/null @@ -1,24 +0,0 @@ -#include "cached_import/py_cached_item.h" - - -#include "cached_import/py_cached_import.h" -#include "common/exception/runtime.h" - -namespace kuzu { - -py::handle PythonCachedItem::operator()() { - assert((bool)PyGILState_Check()); - // load if unloaded, return cached object if already loaded - if (loaded) { - return object; - } - if (parent == nullptr) { - object = importCache->addToCache(std::move(py::module::import(name.c_str()))); - } else { - object = importCache->addToCache(std::move((*parent)().attr(name.c_str()))); - } - loaded = true; - return object; -} - -} // namespace kuzu diff --git a/tools/python_api/src_cpp/include/cached_import/py_cached_import.h b/tools/python_api/src_cpp/include/cached_import/py_cached_import.h deleted file mode 100644 index cd093224e6..0000000000 --- a/tools/python_api/src_cpp/include/cached_import/py_cached_import.h +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include - -#include "py_cached_modules.h" - -namespace kuzu { - -class PythonCachedImport { -public: - // Note: Callers generally acquire the GIL prior to entering functions - // that require the import cache. - - PythonCachedImport() = default; - ~PythonCachedImport(); - - py::handle addToCache(py::object obj); - - DateTimeCachedItem datetime; - DecimalCachedItem decimal; - InspectCachedItem inspect; - NumpyMaCachedItem numpyma; - PandasCachedItem pandas; - PyarrowCachedItem pyarrow; - UUIDCachedItem uuid; - -private: - std::vector allObjects; -}; - -extern std::shared_ptr importCache; - -} // namespace kuzu diff --git a/tools/python_api/src_cpp/include/cached_import/py_cached_item.h b/tools/python_api/src_cpp/include/cached_import/py_cached_item.h deleted file mode 100644 index 159220e79b..0000000000 --- a/tools/python_api/src_cpp/include/cached_import/py_cached_item.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once - -#include -#include - -#include "pybind_include.h" - -namespace kuzu { - -class PythonCachedItem { -public: - explicit PythonCachedItem(const std::string& name, PythonCachedItem* parent = nullptr) - : name(name), parent(parent), loaded(false) {} - virtual ~PythonCachedItem() = default; - - bool isLoaded() const {return loaded;} - py::handle operator()(); - -private: - std::string name; - PythonCachedItem* parent; - bool loaded; - py::handle object; -}; - -} // namespace kuzu diff --git a/tools/python_api/src_cpp/include/cached_import/py_cached_modules.h b/tools/python_api/src_cpp/include/cached_import/py_cached_modules.h deleted file mode 100644 index 6bc374df4c..0000000000 --- a/tools/python_api/src_cpp/include/cached_import/py_cached_modules.h +++ /dev/null @@ -1,128 +0,0 @@ -#pragma once - -#include "py_cached_item.h" - -namespace kuzu { - -class DateTimeCachedItem : public PythonCachedItem { - -public: - DateTimeCachedItem() : PythonCachedItem("datetime"), date("date", this), - datetime("datetime", this), timedelta("timedelta", this) {} - - PythonCachedItem date; - PythonCachedItem datetime; - PythonCachedItem timedelta; -}; - -class DecimalCachedItem : public PythonCachedItem { - -public: - DecimalCachedItem() : PythonCachedItem("decimal"), Decimal("Decimal", this) {} - - PythonCachedItem Decimal; -}; - -class InspectCachedItem : public PythonCachedItem { - -public: - InspectCachedItem() : PythonCachedItem("inspect"), currentframe("currentframe", this) {} - - PythonCachedItem currentframe; -}; - -class NumpyMaCachedItem : public PythonCachedItem { - -public: - NumpyMaCachedItem() : PythonCachedItem("numpy.ma"), masked_array("masked_array", this) {} - - PythonCachedItem masked_array; -}; - -class PandasCachedItem : public PythonCachedItem { - - class SeriesCachedItem : public PythonCachedItem { - public: - explicit SeriesCachedItem(PythonCachedItem* parent): PythonCachedItem("series", parent), - Series("Series", this) {} - - PythonCachedItem Series; - }; - - class CoreCachedItem : public PythonCachedItem { - public: - explicit CoreCachedItem(PythonCachedItem* parent): PythonCachedItem("core", parent), - series(this) {} - - SeriesCachedItem series; - }; - - class DataFrameCachedItem : public PythonCachedItem { - public: - explicit DataFrameCachedItem(PythonCachedItem* parent): PythonCachedItem("DataFrame", parent), - from_dict("from_dict", this) {} - - PythonCachedItem from_dict; - }; - -public: - PandasCachedItem() : PythonCachedItem("pandas"), core(this), DataFrame(this), NA("NA", this), - NaT("NaT", this) {} - - CoreCachedItem core; - DataFrameCachedItem DataFrame; - PythonCachedItem NA; - PythonCachedItem NaT; -}; - -class PyarrowCachedItem : public PythonCachedItem { - - class RecordBatchCachedItem : public PythonCachedItem { - public: - explicit RecordBatchCachedItem(PythonCachedItem* parent): PythonCachedItem("RecordBatch", parent), - _import_from_c("_import_from_c", this) {} - - PythonCachedItem _import_from_c; - }; - - class SchemaCachedItem : public PythonCachedItem { - public: - explicit SchemaCachedItem(PythonCachedItem* parent): PythonCachedItem("Schema", parent), - _import_from_c("_import_from_c", this) {} - - PythonCachedItem _import_from_c; - }; - - class TableCachedItem : public PythonCachedItem { - public: - explicit TableCachedItem(PythonCachedItem* parent): PythonCachedItem("Table", parent), - from_batches("from_batches", this) {} - - PythonCachedItem from_batches; - }; - - class LibCachedItem : public PythonCachedItem { - public: - explicit LibCachedItem(PythonCachedItem* parent): PythonCachedItem("lib", parent), - RecordBatch(this), Schema(this), Table(this) {} - - RecordBatchCachedItem RecordBatch; - SchemaCachedItem Schema; - TableCachedItem Table; - }; - -public: - PyarrowCachedItem(): PythonCachedItem("pyarrow"), lib(this) {} - - LibCachedItem lib; -}; - -class UUIDCachedItem : public PythonCachedItem { - -public: - UUIDCachedItem() : PythonCachedItem("uuid"), UUID("UUID", this) {} - - PythonCachedItem UUID; -}; - -} // namespace kuzu diff --git a/tools/python_api/src_cpp/include/py_database.h b/tools/python_api/src_cpp/include/py_database.h index 4ad2bf81bb..8e1ca8c066 100644 --- a/tools/python_api/src_cpp/include/py_database.h +++ b/tools/python_api/src_cpp/include/py_database.h @@ -19,7 +19,7 @@ class PyDatabase { explicit PyDatabase(const std::string& databasePath, uint64_t bufferPoolSize, uint64_t maxNumThreads, bool compression, bool readOnly); - ~PyDatabase(); + ~PyDatabase() = default; template void scanNodeTable(const std::string& tableName, const std::string& propName, diff --git a/tools/python_api/src_cpp/pandas/pandas_analyzer.cpp b/tools/python_api/src_cpp/pandas/pandas_analyzer.cpp index 354aa9a55b..5fd2af3d43 100644 --- a/tools/python_api/src_cpp/pandas/pandas_analyzer.cpp +++ b/tools/python_api/src_cpp/pandas/pandas_analyzer.cpp @@ -1,7 +1,6 @@ #include "pandas/pandas_analyzer.h" #include "function/built_in_function_utils.h" -#include "cached_import/py_cached_import.h" #include "py_conversion.h" namespace kuzu { @@ -38,7 +37,7 @@ common::LogicalType PandasAnalyzer::getListType(py::object& ele, bool& canConver for (auto pyVal : ele) { auto object = py::reinterpret_borrow(pyVal); auto itemType = getItemType(object, canConvert); - if (i == 0) { + if (i != 0) { listType = itemType; } else { if (!upgradeType(listType, itemType)) { @@ -89,8 +88,8 @@ static py::object findFirstNonNull(const py::handle& row, uint64_t numRows) { common::LogicalType PandasAnalyzer::innerAnalyze(py::object column, bool& canConvert) { auto numRows = py::len(column); - auto pandasModule = importCache->pandas; - auto pandasSeries = pandasModule.core.series.Series(); + auto pandasModule = py::module::import("pandas"); + auto pandasSeries = pandasModule.attr("core").attr("series").attr("Series"); if (py::isinstance(column, pandasSeries)) { column = column.attr("__array__")(); diff --git a/tools/python_api/src_cpp/pandas/pandas_scan.cpp b/tools/python_api/src_cpp/pandas/pandas_scan.cpp index f9473a2a9f..a8027245bc 100644 --- a/tools/python_api/src_cpp/pandas/pandas_scan.cpp +++ b/tools/python_api/src_cpp/pandas/pandas_scan.cpp @@ -1,7 +1,6 @@ #include "pandas/pandas_scan.h" #include "function/table/bind_input.h" -#include "cached_import/py_cached_import.h" #include "numpy/numpy_scan.h" #include "py_connection.h" #include "pybind11/pytypes.h" @@ -128,9 +127,10 @@ std::unique_ptr tryReplacePD(py::dict& dict, py::str& tableName) { } std::unique_ptr replacePD(common::Value* value) { + py::gil_scoped_acquire acquire; auto pyTableName = py::str(value->getValue()); // Here we do an exhaustive search on the frame lineage. - auto currentFrame = importCache->inspect.currentframe()(); + auto currentFrame = py::module::import("inspect").attr("currentframe")(); while (hasattr(currentFrame, "f_locals")) { auto localDict = py::reinterpret_borrow(currentFrame.attr("f_locals")); if (localDict) { diff --git a/tools/python_api/src_cpp/py_connection.cpp b/tools/python_api/src_cpp/py_connection.cpp index 6517cd0953..8ddef71b33 100644 --- a/tools/python_api/src_cpp/py_connection.cpp +++ b/tools/python_api/src_cpp/py_connection.cpp @@ -4,14 +4,12 @@ #include "common/string_format.h" #include "datetime.h" // from Python -#include "cached_import/py_cached_import.h" #include "main/connection.h" #include "pandas/pandas_scan.h" #include "processor/result/factorized_table.h" #include "common/types/uuid.h" using namespace kuzu::common; -using namespace kuzu; void PyConnection::initialize(py::handle& m) { py::class_(m, "Connection") @@ -153,7 +151,9 @@ void PyConnection::getAllEdgesForTorchGeometric(py::array_t& npArray, } bool PyConnection::isPandasDataframe(const py::object& object) { - return py::isinstance(object, importCache->pandas.DataFrame()); + // TODO(Ziyi): introduce PythonCachedImport to avoid unnecessary import. + py::module pandas = py::module::import("pandas"); + return py::isinstance(object, pandas.attr("DataFrame")); } static Value transformPythonValue(py::handle val); @@ -176,10 +176,11 @@ std::unordered_map> transformPythonParameter } Value transformPythonValue(py::handle val) { - auto datetime_datetime = importCache->datetime.datetime(); - auto time_delta = importCache->datetime.timedelta(); - auto datetime_date = importCache->datetime.date(); - auto uuid = importCache->uuid.UUID(); + auto datetime_mod = py::module::import("datetime"); + auto datetime_datetime = datetime_mod.attr("datetime"); + auto time_delta = datetime_mod.attr("timedelta"); + auto datetime_date = datetime_mod.attr("date"); + auto uuid = py::module::import("uuid").attr("UUID"); if (py::isinstance(val)) { return Value::createValue(val.cast()); } else if (py::isinstance(val)) { diff --git a/tools/python_api/src_cpp/py_conversion.cpp b/tools/python_api/src_cpp/py_conversion.cpp index bad439c29d..d194a4b86c 100644 --- a/tools/python_api/src_cpp/py_conversion.cpp +++ b/tools/python_api/src_cpp/py_conversion.cpp @@ -1,18 +1,18 @@ #include "py_conversion.h" #include "common/type_utils.h" -#include "cached_import/py_cached_import.h" namespace kuzu { using namespace kuzu::common; -using kuzu::importCache; PythonObjectType getPythonObjectType(py::handle& ele) { - auto pandasNa = importCache->pandas.NA(); - auto pyDateTime = importCache->datetime.datetime(); - auto pandasNat = importCache->pandas.NaT(); - auto pyDate = importCache->datetime.date(); + py::object pandas = py::module::import("pandas"); + auto pandasNa = pandas.attr("NA"); + auto pandasNat = pandas.attr("NaT"); + py::object datetime = py::module::import("datetime"); + auto pyDateTime = datetime.attr("datetime"); + auto pyDate = datetime.attr("date"); if (ele.is_none() || ele.is(pandasNa) || ele.is(pandasNat)) { return PythonObjectType::None; } else if (py::isinstance(ele)) { diff --git a/tools/python_api/src_cpp/py_database.cpp b/tools/python_api/src_cpp/py_database.cpp index 5fdc4270e9..c4d637c5d7 100644 --- a/tools/python_api/src_cpp/py_database.cpp +++ b/tools/python_api/src_cpp/py_database.cpp @@ -1,6 +1,4 @@ #include "include/py_database.h" - -#include "include/cached_import/py_cached_import.h" #include "pandas/pandas_scan.h" #include @@ -39,11 +37,6 @@ PyDatabase::PyDatabase(const std::string& databasePath, uint64_t bufferPoolSize, database = std::make_unique(databasePath, systemConfig); database->addBuiltInFunction(READ_PANDAS_FUNC_NAME, kuzu::PandasScanFunction::getFunctionSet()); storageDriver = std::make_unique(database.get()); - kuzu::importCache = std::make_shared(); -} - -PyDatabase::~PyDatabase() { - kuzu::importCache.reset(); } template diff --git a/tools/python_api/src_cpp/py_query_result.cpp b/tools/python_api/src_cpp/py_query_result.cpp index cfa88c6f54..a0c1edf388 100644 --- a/tools/python_api/src_cpp/py_query_result.cpp +++ b/tools/python_api/src_cpp/py_query_result.cpp @@ -9,11 +9,9 @@ #include "common/types/value/node.h" #include "common/types/value/rel.h" #include "datetime.h" // python lib -#include "cached_import/py_cached_import.h" #include "include/py_query_result_converter.h" using namespace kuzu::common; -using kuzu::importCache; #define PyDateTimeTZ_FromDateAndTime(year, month, day, hour, min, sec, usec, timezone) \ PyDateTimeAPI->DateTime_FromDateAndTime( \ @@ -127,7 +125,8 @@ py::object convertRdfVariantToPyObject(const Value& value) { case LogicalTypeID::INTERVAL: { auto intervalVal = RdfVariant::getValue(&value); auto days = Interval::DAYS_PER_MONTH * intervalVal.months + intervalVal.days; - return py::cast(importCache->datetime.timedelta()(py::arg("days") = days, + return py::cast(py::module::import("datetime") + .attr("timedelta")(py::arg("days") = days, py::arg("microseconds") = intervalVal.micros)); } default: { @@ -173,8 +172,7 @@ py::object PyQueryResult::convertValueToPyObject(const Value& value) { case LogicalTypeID::INT128: { kuzu::common::int128_t result = value.getValue(); std::string int128_string = kuzu::common::Int128_t::ToString(result); - - auto Decimal = importCache->decimal.Decimal(); + py::object Decimal = py::module_::import("decimal").attr("Decimal"); py::object largeInt = Decimal(int128_string); return largeInt; } @@ -195,7 +193,7 @@ py::object PyQueryResult::convertValueToPyObject(const Value& value) { case LogicalTypeID::UUID: { kuzu::common::int128_t result = value.getValue(); std::string uuidString = kuzu::common::UUID::toString(result); - auto UUID = importCache->uuid.UUID(); + py::object UUID = py::module_::import("uuid").attr("UUID"); return UUID(uuidString); } case LogicalTypeID::DATE: { @@ -238,8 +236,8 @@ py::object PyQueryResult::convertValueToPyObject(const Value& value) { case LogicalTypeID::INTERVAL: { auto intervalVal = value.getValue(); auto days = Interval::DAYS_PER_MONTH * intervalVal.months + intervalVal.days; - - return py::cast(importCache->datetime.timedelta()(py::arg("days") = days, + return py::cast(py::module::import("datetime") + .attr("timedelta")(py::arg("days") = days, py::arg("microseconds") = intervalVal.micros)); } case LogicalTypeID::VAR_LIST: @@ -333,9 +331,9 @@ bool PyQueryResult::getNextArrowChunk(const std::vectorpyarrow.lib.RecordBatch._import_from_c(); - + // TODO(Ziyi): use import cache to improve performance. + auto pyarrowLibModule = py::module::import("pyarrow").attr("lib"); + auto batchImportFunc = pyarrowLibModule.attr("RecordBatch").attr("_import_from_c"); auto schema = ArrowConverter::toArrowSchema(typesInfo); batches.append(batchImportFunc((std::uint64_t)&data, (std::uint64_t)schema.get())); return true; @@ -343,19 +341,22 @@ bool PyQueryResult::getNextArrowChunk(const std::vector>& typesInfo, std::int64_t chunkSize) { + auto pyarrowLibModule = py::module::import("pyarrow").attr("lib"); py::list batches; while (getNextArrowChunk(typesInfo, batches, chunkSize)) {} return batches; } kuzu::pyarrow::Table PyQueryResult::getAsArrow(std::int64_t chunkSize) { + py::gil_scoped_acquire acquire; + + auto pyarrowLibModule = py::module::import("pyarrow").attr("lib"); + auto fromBatchesFunc = pyarrowLibModule.attr("Table").attr("from_batches"); + auto schemaImportFunc = pyarrowLibModule.attr("Schema").attr("_import_from_c"); auto typesInfo = queryResult->getColumnTypesInfo(); py::list batches = getArrowChunks(typesInfo, chunkSize); auto schema = ArrowConverter::toArrowSchema(typesInfo); - - auto fromBatchesFunc = importCache->pyarrow.lib.Table.from_batches(); - auto schemaImportFunc = importCache->pyarrow.lib.Schema._import_from_c(); auto schemaObj = schemaImportFunc((std::uint64_t)schema.get()); return py::cast(fromBatchesFunc(batches, schemaObj)); } diff --git a/tools/python_api/src_cpp/py_query_result_converter.cpp b/tools/python_api/src_cpp/py_query_result_converter.cpp index d7e996b50d..7b2a4e5bca 100644 --- a/tools/python_api/src_cpp/py_query_result_converter.cpp +++ b/tools/python_api/src_cpp/py_query_result_converter.cpp @@ -1,11 +1,9 @@ #include "include/py_query_result_converter.h" #include "common/types/value/value.h" -#include "cached_import/py_cached_import.h" #include "include/py_query_result.h" using namespace kuzu::common; -using namespace kuzu; NPArrayWrapper::NPArrayWrapper(const LogicalType& type, uint64_t numFlatTuple) : type{type}, numElements{0} { @@ -207,12 +205,9 @@ py::object QueryResultConverter::toDF() { py::dict result; auto colNames = queryResult->getColumnNames(); - auto maskedArray = importCache->numpyma.masked_array(); - auto fromDict = importCache->pandas.DataFrame.from_dict(); - for (auto i = 0u; i < colNames.size(); i++) { result[colNames[i].c_str()] = - maskedArray(columns[i]->data, columns[i]->mask); + py::module::import("numpy.ma").attr("masked_array")(columns[i]->data, columns[i]->mask); } - return fromDict(result); + return py::module::import("pandas").attr("DataFrame").attr("from_dict")(result); }