Skip to content

Commit

Permalink
fix(python): filetype should be in read_options (#3683)
Browse files Browse the repository at this point in the history
Otherwise it would be passed to the storage backend.

Fixes #3682
  • Loading branch information
siyuan0322 authored Mar 29, 2024
1 parent b2db3bd commit 33bc365
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 7 deletions.
4 changes: 1 addition & 3 deletions coordinator/gscoordinator/op_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,9 +972,7 @@ def _process_loader_func(loader, vineyard_endpoint, vineyard_ipc_socket):
except: # noqa: E722, pylint: disable=bare-except
storage_options = {}
read_options = {}
filetype = storage_options.get("filetype", None)
if filetype is None:
filetype = read_options.get("filetype", None)
filetype = read_options.get("filetype", None)
filetype = str(filetype).upper()
if (
protocol in ("hdfs", "hive", "oss", "s3")
Expand Down
9 changes: 5 additions & 4 deletions python/graphscope/framework/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def __init__(self) -> None:
# If true, column names will be read from the first CSV row
# If false, column names will be of the form "f0", "f1"...
self.header_row = True
self.filetype = "CSV"

def to_dict(self) -> Dict:
options = {}
Expand All @@ -80,6 +81,7 @@ def to_dict(self) -> Dict:
options["column_types"] = ",".join(cpp_types)
if self.force_include_all:
options["include_all_columns"] = self.force_include_all
options["filetype"] = self.filetype
return options

def __str__(self) -> str:
Expand All @@ -95,7 +97,7 @@ class Loader(object):
"""

def __init__(
self, source, delimiter=",", sep=",", header_row=True, filetype=None, **kwargs
self, source, delimiter=",", sep=",", header_row=True, filetype="CSV", **kwargs
):
"""Initialize a loader with configurable options.
Note: Loader cannot be reused since it may change inner state when constructing
Expand Down Expand Up @@ -146,13 +148,13 @@ def __init__(
)
self.options.delimiter = delimiter
self.options.header_row = header_row
self.options.filetype = filetype
# meta for data source is numpy or dataframe
self.deduced_properties = None
# extra args directly passed to storage system
# find more details in fsspec
# https://filesystem-spec.readthedocs.io/en/latest/
self.storage_options = kwargs
self.storage_options["filetype"] = filetype
# also parse protocol and source in `resolve` method
self.resolve(source)

Expand Down Expand Up @@ -255,8 +257,7 @@ def get_attr(self):
self.source.endswith(".orc")
or self.source.endswith(".parquet")
or self.source.endswith(".pq")
or str(self.storage_options.get("filetype")).upper()
in ["ORC", "PARQUET"]
or str(self.options.filetype).upper() in ["ORC", "PARQUET"]
):
# orc and parquet: handled by vineyard
config[types_pb2.SOURCE] = utils.s_to_attr(self.source)
Expand Down

0 comments on commit 33bc365

Please sign in to comment.