Skip to content

Commit

Permalink
Fix unicode conversion for pandas dataframe (#3029)
Browse files Browse the repository at this point in the history
  • Loading branch information
mewim committed Mar 12, 2024
1 parent 3bdc752 commit 18c2c8f
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 17 deletions.
11 changes: 4 additions & 7 deletions tools/python_api/src_cpp/py_query_result_converter.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include "include/py_query_result_converter.h"

#include "common/types/value/value.h"
#include "cached_import/py_cached_import.h"
#include "common/types/value/value.h"
#include "include/py_query_result.h"

using namespace kuzu::common;
Expand Down Expand Up @@ -79,10 +79,8 @@ void NPArrayWrapper::appendElement(Value* value) {
} break;
case LogicalTypeID::STRING: {
auto val = value->getValue<std::string>();
auto result = PyUnicode_New(val.length(), 127);
auto target_data = PyUnicode_DATA(result);
memcpy(target_data, val.c_str(), val.length());
((PyObject**)dataBuffer)[numElements] = result;
py::str result(val);
((py::str*)dataBuffer)[numElements] = result;
} break;
case LogicalTypeID::BLOB: {
((py::bytes*)dataBuffer)[numElements] = PyQueryResult::convertValueToPyObject(*value);
Expand Down Expand Up @@ -211,8 +209,7 @@ py::object QueryResultConverter::toDF() {
auto fromDict = importCache->pandas.DataFrame.from_dict();

for (auto i = 0u; i < colNames.size(); i++) {
result[colNames[i].c_str()] =
maskedArray(columns[i]->data, columns[i]->mask);
result[colNames[i].c_str()] = maskedArray(columns[i]->data, columns[i]->mask);
}
return fromDict(result);
}
83 changes: 73 additions & 10 deletions tools/python_api/test/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,16 @@ def _test_person_to_df(conn: kuzu.Connection) -> None:
assert str(pd["p.fName"].dtype) == "object"
assert pd["p.gender"].tolist() == [1, 2, 1, 2, 1, 2, 2, 2]
assert str(pd["p.gender"].dtype) == "int64"
assert pd["p.isStudent"].tolist() == [True, True, False, False, False, True, False, False]
assert pd["p.isStudent"].tolist() == [
True,
True,
False,
False,
False,
True,
False,
False,
]
assert str(pd["p.isStudent"].dtype) == "bool"
assert pd["p.eyeSight"].tolist() == [5.0, 5.1, 5.0, 4.8, 4.7, 4.5, 4.9, 4.9]
assert str(pd["p.eyeSight"].dtype) == "float64"
Expand Down Expand Up @@ -143,7 +152,11 @@ def _test_study_at_to_df(conn: kuzu.Connection) -> None:
assert str(pd["r.ulength"].dtype) == "uint16"
assert pd["r.ulevel"].tolist() == [250, 12, 220]
assert str(pd["r.ulevel"].dtype) == "uint8"
assert pd["r.hugedata"].tolist() == [1.8446744073709552e27, -15.0, -1.8446744073709552e21]
assert pd["r.hugedata"].tolist() == [
1.8446744073709552e27,
-15.0,
-1.8446744073709552e21,
]
assert str(pd["r.hugedata"].dtype) == "float64"

def _test_timestamps_to_df(conn: kuzu.Connection) -> None:
Expand Down Expand Up @@ -214,9 +227,17 @@ def _test_movies_to_df(conn: kuzu.Connection) -> None:
},
]
assert str(pd["m.description"].dtype) == "object"
assert pd["m.content"].tolist() == [b"\xaa\xabinteresting\x0b", b"pure ascii characters", b"\xab\xcd"]
assert pd["m.content"].tolist() == [
b"\xaa\xabinteresting\x0b",
b"pure ascii characters",
b"\xab\xcd",
]
assert str(pd["m.content"].dtype) == "object"
assert pd["m.audience"].tolist() == [{"audience1": 52, "audience53": 42}, {}, {"audience1": 33}]
assert pd["m.audience"].tolist() == [
{"audience1": 52, "audience53": 42},
{},
{"audience1": 33},
]
assert str(pd["m.audience"].dtype) == "object"
assert pd["m.grade"].tolist() == [True, 254.0, 8.989]
assert str(pd["m.grade"].dtype) == "object"
Expand Down Expand Up @@ -294,7 +315,16 @@ def test_df_get_node(establish_connection: ConnDB) -> None:
Timedelta("3750 days 13:00:00.000024"),
Timedelta("1082 days 13:02:00"),
],
"workedHours": [[10, 5], [12, 8], [4, 5], [1, 9], [2], [3, 4, 5, 6, 7], [1], [10, 11, 12, 3, 4, 5, 6, 7]],
"workedHours": [
[10, 5],
[12, 8],
[4, 5],
[1, 9],
[2],
[3, 4, 5, 6, 7],
[1],
[10, 11, 12, 3, 4, 5, 6, 7],
],
"usedNames": [
["Aida"],
["Bobby"],
Expand All @@ -315,7 +345,16 @@ def test_df_get_node(establish_connection: ConnDB) -> None:
[[10]],
[[7], [10], [6, 7]],
],
"_label": ["person", "person", "person", "person", "person", "person", "person", "person"],
"_label": [
"person",
"person",
"person",
"person",
"person",
"person",
"person",
"person",
],
}
for i in range(len(p_list)):
p = p_list[i]
Expand All @@ -340,7 +379,11 @@ def test_df_get_node_rel(establish_connection: ConnDB) -> None:
"gender": [1, 2, 1],
"isStudent": [False, False, False],
"eyeSight": [5.0, 4.8, 4.7],
"birthdate": [datetime.date(1940, 6, 22), datetime.date(1950, 7, 23), datetime.date(1980, 10, 26)],
"birthdate": [
datetime.date(1940, 6, 22),
datetime.date(1950, 7, 23),
datetime.date(1980, 10, 26),
],
"registerTime": [
Timestamp("1911-08-20 02:32:21"),
Timestamp("2031-11-30 12:25:30"),
Expand All @@ -367,7 +410,11 @@ def test_df_get_node_rel(establish_connection: ConnDB) -> None:
"orgCode": [934, 824, 824],
"mark": [4.1, 4.1, 4.1],
"score": [-100, 7, 7],
"history": ["2 years 4 days 10 hours", "2 years 4 hours 22 us 34 minutes", "2 years 4 hours 22 us 34 minutes"],
"history": [
"2 years 4 days 10 hours",
"2 years 4 hours 22 us 34 minutes",
"2 years 4 hours 22 us 34 minutes",
],
"licenseValidInterval": [
Timedelta(days=9414),
Timedelta(days=3, seconds=36000, microseconds=100000),
Expand Down Expand Up @@ -430,7 +477,10 @@ def test_df_get_recursive_join(establish_connection: ConnDB) -> None:
"notes": 1,
"summary": {
"locations": ["'toronto'", "'waterloo'"],
"transfer": {"amount": [100, 200], "day": datetime.date(2021, 1, 2)},
"transfer": {
"amount": [100, 200],
"day": datetime.date(2021, 1, 2),
},
},
"someMap": {"a": "b"},
"validInterval": datetime.timedelta(days=3750, seconds=46800, microseconds=24),
Expand All @@ -445,7 +495,10 @@ def test_df_get_recursive_join(establish_connection: ConnDB) -> None:
"notes": 1,
"summary": {
"locations": ["'toronto'", "'waterloo'"],
"transfer": {"amount": [100, 200], "day": datetime.date(2021, 1, 2)},
"transfer": {
"amount": [100, 200],
"day": datetime.date(2021, 1, 2),
},
},
"someMap": {"a": "b"},
"validInterval": datetime.timedelta(days=3750, seconds=46800, microseconds=24),
Expand Down Expand Up @@ -475,3 +528,13 @@ def test_get_rdf_variant(establish_connection: ConnDB) -> None:
datetime.timedelta(days=2),
b"\xb2",
]


def test_get_df_unicode(establish_connection: ConnDB) -> None:
    """Verify that non-ASCII strings survive conversion to a DataFrame.

    Queries the ``name`` property of every ``movies`` node, converts the
    result with ``get_as_df()``, and checks that accented characters and
    multi-code-point emoji sequences come back unchanged.
    """
    expected_names = [
        "Sóló cón tu párejâ",
        "The 😂😃🧘🏻‍♂️🌍🌦️🍞🚗 movie",
        "Roma",
    ]
    # The fixture yields a pair; only the connection is needed here.
    conn, _ = establish_connection
    query_result = conn.execute("MATCH (m:movies) RETURN m.name")
    name_column = query_result.get_as_df()["m.name"]
    assert name_column.tolist() == expected_names

0 comments on commit 18c2c8f

Please sign in to comment.