Update get_as_pl (should always return a single chunk) (#3110)
alexander-beedie committed Mar 22, 2024
1 parent bdae55f commit f9e1b12
Showing 2 changed files with 18 additions and 22 deletions.
tools/python_api/src_py/query_result.py (13 additions, 20 deletions)

@@ -123,20 +123,10 @@ def get_as_df(self) -> pd.DataFrame:

         return self._query_result.getAsDF()
 
-    def get_as_pl(self, chunk_size: int | None = None) -> pl.DataFrame:
+    def get_as_pl(self) -> pl.DataFrame:
         """
         Get the query result as a Polars DataFrame.
 
-        Parameters
-        ----------
-        chunk_size : Number of rows to include in each chunk.
-            None
-                The chunk size is adaptive and depends on the number of columns in the query result.
-            -1 or 0
-                The entire result is returned as a single chunk.
-            > 0
-                The chunk size is the number of elements specified.
-
         See Also
         --------
         get_as_df : Get the query result as a Pandas DataFrame.
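For context, a minimal call-site sketch of the signature change; the database path and query here are hypothetical, but `Database`, `Connection`, and `execute` are the standard kuzu Python entry points:

```python
import kuzu

db = kuzu.Database("./demo_db")  # hypothetical example database
conn = kuzu.Connection(db)
result = conn.execute("MATCH (p:Person) RETURN p.name")

df = result.get_as_pl()  # new: takes no arguments
# result.get_as_pl(chunk_size=2048)  # old call style; now a TypeError
```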
@@ -151,7 +141,11 @@ def get_as_pl(self, chunk_size: int | None = None) -> pl.DataFrame:
 
         self.check_for_query_result_close()
 
-        return pl.from_arrow(data=self.get_as_arrow(chunk_size=chunk_size))
+        # note: polars should always export just a single chunk,
+        # (eg: "-1") otherwise it will just need to rechunk anyway
+        return pl.from_arrow(  # type: ignore[return-value]
+            data=self.get_as_arrow(chunk_size=-1),
+        )
 
     def get_as_arrow(self, chunk_size: int | None = None) -> pa.Table:
         """
@@ -165,7 +159,7 @@ def get_as_arrow(self, chunk_size: int | None = None) -> pa.Table:
             -1 or 0
                 The entire result is returned as a single chunk.
             > 0
-                The chunk size is the number of elements specified.
+                The chunk size is the number of rows specified.
 
         See Also
         --------
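Read as a usage sketch, the three modes look like this; `result` stands in for any `kuzu.QueryResult`, and the values are illustrative:

```python
tbl = result.get_as_arrow()                  # None: adaptive, based on column count
tbl = result.get_as_arrow(chunk_size=-1)     # -1 or 0: all rows in a single chunk
tbl = result.get_as_arrow(chunk_size=1_000)  # > 0: chunks of 1,000 rows each
```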
@@ -180,16 +174,15 @@ def get_as_arrow(self, chunk_size: int | None = None) -> pa.Table:
         self.check_for_query_result_close()
 
         if chunk_size is None:
-            # Adaptive chunk_size; target number of elements per chunk_size
-            target_chunk_size = max(1_000_000 // len(self.get_column_names()), 10)
+            # Adaptive; target 10m total elements in each chunk.
+            # (eg: if we had 10 cols, this would result in a 1m row chunk_size).
+            target_n_elems = 10_000_000
+            chunk_size = max(target_n_elems // len(self.get_column_names()), 10)
         elif chunk_size <= 0:
             # No chunking: return the entire result as a single chunk
-            target_chunk_size = self.get_num_tuples()
-        else:
-            # Chunk size is the number of elements specified
-            target_chunk_size = chunk_size
+            chunk_size = self.get_num_tuples()
 
-        return self._query_result.getAsArrow(target_chunk_size)
+        return self._query_result.getAsArrow(chunk_size)
 
     def get_column_data_types(self) -> list[str]:
         """
tools/python_api/test/test_arrow.py (5 additions, 2 deletions)

@@ -84,8 +84,11 @@


 def get_result(query_result: kuzu.QueryResult, result_type: str, chunk_size: int | None) -> Any:
-    sz = [] if chunk_size is None else [chunk_size]
-    return getattr(query_result, f"get_as_{result_type}")(*sz)
+    sz = [] if (chunk_size is None or result_type == "pl") else [chunk_size]
+    res = getattr(query_result, f"get_as_{result_type}")(*sz)
+    if result_type == "arrow" and chunk_size:
+        assert res[0].num_chunks == max((len(res) // chunk_size), 1)
+    return res
 
 
 def assert_column_equals(data: Any, col_name: str, return_type: str, expected_values: list[Any]) -> None:
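The new assertion ties the Arrow table's chunk count to the requested chunk size; a self-contained illustration with hypothetical numbers, assuming a chunk size that divides the row count evenly:

```python
import pyarrow as pa

# One column split into three 4-row chunks (12 rows total).
col = pa.chunked_array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
tbl = pa.table({"x": col})

chunk_size = 4
# Mirrors the test: chunk count of the first column vs rows // chunk_size.
assert tbl[0].num_chunks == max(len(tbl) // chunk_size, 1)  # 3 == 12 // 4
```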
