Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/strip whitespace #1136

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ def __parse_server_exception(
if response.status_code == 414:
raise OpenMLServerError("URI too long! ({})".format(url))
try:
server_exception = xmltodict.parse(response.text)
server_exception = xmltodict.parse(response.text, strip_whitespace=False)
except xml.parsers.expat.ExpatError:
raise
except Exception:
Expand Down
2 changes: 1 addition & 1 deletion openml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def publish(self) -> "OpenMLBase":
response_text = openml._api_calls._perform_api_call(
call, "post", file_elements=file_elements
)
xml_response = xmltodict.parse(response_text)
xml_response = xmltodict.parse(response_text, strip_whitespace=False)

self._parse_publish_response(xml_response)
return self
Expand Down
8 changes: 6 additions & 2 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,7 +934,9 @@ def _read_features(features_file: str) -> Dict[int, OpenMLDataFeature]:
with open(features_file, encoding="utf8") as fh:
features_xml_string = fh.read()
xml_dict = xmltodict.parse(
features_xml_string, force_list=("oml:feature", "oml:nominal_value")
features_xml_string,
strip_whitespace=False,
force_list=("oml:feature", "oml:nominal_value"),
)
features_xml = xml_dict["oml:data_features"]

Expand Down Expand Up @@ -970,7 +972,9 @@ def _read_qualities(qualities_file: str) -> Dict[str, float]:
except: # noqa E722
with open(qualities_file, encoding="utf8") as fh:
qualities_xml = fh.read()
xml_as_dict = xmltodict.parse(qualities_xml, force_list=("oml:quality",))
xml_as_dict = xmltodict.parse(
qualities_xml, strip_whitespace=False, force_list=("oml:quality",)
)
qualities = xml_as_dict["oml:data_qualities"]["oml:quality"]
qualities = _check_qualities(qualities)
with open(qualities_pickle_file, "wb") as fh_binary:
Expand Down
22 changes: 12 additions & 10 deletions openml/datasets/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def list_qualities() -> List[str]:
"""
api_call = "data/qualities/list"
xml_string = openml._api_calls._perform_api_call(api_call, "get")
qualities = xmltodict.parse(xml_string, force_list=("oml:quality"))
qualities = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:quality"))
# Minimalistic check if the XML is useful
if "oml:data_qualities_list" not in qualities:
raise ValueError("Error in return XML, does not contain " '"oml:data_qualities_list"')
Expand Down Expand Up @@ -181,7 +181,7 @@ def _list_datasets(data_id: Optional[List] = None, output_format="dict", **kwarg
def __list_datasets(api_call, output_format="dict"):

xml_string = openml._api_calls._perform_api_call(api_call, "get")
datasets_dict = xmltodict.parse(xml_string, force_list=("oml:dataset",))
datasets_dict = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:dataset",))

# Minimalistic check if the XML is useful
assert type(datasets_dict["oml:data"]["oml:dataset"]) == list, type(datasets_dict["oml:data"])
Expand Down Expand Up @@ -724,7 +724,7 @@ def status_update(data_id, status):
raise ValueError("Illegal status value. " "Legal values: %s" % legal_status)
data = {"data_id": data_id, "status": status}
result_xml = openml._api_calls._perform_api_call("data/status/update", "post", data=data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
server_data_id = result["oml:data_status_update"]["oml:id"]
server_status = result["oml:data_status_update"]["oml:status"]
if status != server_status or int(data_id) != int(server_data_id):
Expand Down Expand Up @@ -832,7 +832,7 @@ def edit_dataset(
result_xml = openml._api_calls._perform_api_call(
"data/edit", "post", data=form_data, file_elements=file_elements
)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_edit"]["oml:id"]
return int(data_id)

Expand Down Expand Up @@ -871,7 +871,7 @@ def fork_dataset(data_id: int) -> int:
# compose data fork parameters
form_data = {"data_id": data_id}
result_xml = openml._api_calls._perform_api_call("data/fork", "post", data=form_data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_fork"]["oml:id"]
return int(data_id)

Expand All @@ -891,7 +891,7 @@ def _topic_add_dataset(data_id: int, topic: str):
raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
form_data = {"data_id": data_id, "topic": topic}
result_xml = openml._api_calls._perform_api_call("data/topicadd", "post", data=form_data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_topic"]["oml:id"]
return int(data_id)

Expand All @@ -912,7 +912,7 @@ def _topic_delete_dataset(data_id: int, topic: str):
raise TypeError("`data_id` must be of type `int`, not {}.".format(type(data_id)))
form_data = {"data_id": data_id, "topic": topic}
result_xml = openml._api_calls._perform_api_call("data/topicdelete", "post", data=form_data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
data_id = result["oml:data_topic"]["oml:id"]
return int(data_id)

Expand Down Expand Up @@ -951,7 +951,7 @@ def _get_dataset_description(did_cache_dir, dataset_id):
with io.open(description_file, "w", encoding="utf8") as fh:
fh.write(dataset_xml)

description = xmltodict.parse(dataset_xml)["oml:data_set_description"]
description = xmltodict.parse(dataset_xml, strip_whitespace=False)["oml:data_set_description"]

return description

Expand Down Expand Up @@ -1205,7 +1205,7 @@ def _get_online_dataset_arff(dataset_id):
# build a dict from the xml.
# use the url from the dataset description and return the ARFF string
return openml._api_calls._download_text_file(
xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:url"],
xmltodict.parse(dataset_xml, strip_whitespace=False)["oml:data_set_description"]["oml:url"],
)


Expand All @@ -1225,4 +1225,6 @@ def _get_online_dataset_format(dataset_id):
"""
dataset_xml = openml._api_calls._perform_api_call("data/%d" % dataset_id, "get")
# build a dict from the xml and get the format from the dataset description
return xmltodict.parse(dataset_xml)["oml:data_set_description"]["oml:format"].lower()
return xmltodict.parse(dataset_xml, strip_whitespace=False)["oml:data_set_description"][
"oml:format"
].lower()
4 changes: 2 additions & 2 deletions openml/evaluations/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def _list_evaluations(
def __list_evaluations(api_call, output_format="object"):
"""Helper function to parse API calls which are lists of evaluations"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
evals_dict = xmltodict.parse(xml_string, force_list=("oml:evaluation",))
evals_dict = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:evaluation",))
# Minimalistic check if the XML is useful
if "oml:evaluations" not in evals_dict:
raise ValueError(
Expand Down Expand Up @@ -265,7 +265,7 @@ def list_evaluation_measures() -> List[str]:
"""
api_call = "evaluationmeasure/list"
xml_string = openml._api_calls._perform_api_call(api_call, "get")
qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
qualities = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:measures"))
# Minimalistic check if the XML is useful
if "oml:evaluation_measures" not in qualities:
raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"')
Expand Down
2 changes: 1 addition & 1 deletion openml/flows/flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ def to_filesystem(self, output_directory: str) -> None:
def from_filesystem(cls, input_directory) -> "OpenMLFlow":
with open(os.path.join(input_directory, "flow.xml"), "r") as f:
xml_string = f.read()
return OpenMLFlow._from_dict(xmltodict.parse(xml_string))
return OpenMLFlow._from_dict(xmltodict.parse(xml_string, strip_whitespace=False))

def _parse_publish_response(self, xml_response: Dict):
""" Parse the id from the xml_response and assign it to self. """
Expand Down
6 changes: 3 additions & 3 deletions openml/flows/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def flow_exists(name: str, external_version: str) -> Union[int, bool]:
"flow/exists", "post", data={"name": name, "external_version": external_version},
)

result_dict = xmltodict.parse(xml_response)
result_dict = xmltodict.parse(xml_response, strip_whitespace=False)
flow_id = int(result_dict["oml:flow_exists"]["oml:id"])
if flow_id > 0:
return flow_id
Expand Down Expand Up @@ -329,7 +329,7 @@ def get_flow_id(
def __list_flows(api_call: str, output_format: str = "dict") -> Union[Dict, pd.DataFrame]:

xml_string = openml._api_calls._perform_api_call(api_call, "get")
flows_dict = xmltodict.parse(xml_string, force_list=("oml:flow",))
flows_dict = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:flow",))

# Minimalistic check if the XML is useful
assert type(flows_dict["oml:flows"]["oml:flow"]) == list, type(flows_dict["oml:flows"])
Expand Down Expand Up @@ -538,4 +538,4 @@ def _create_flow_from_xml(flow_xml: str) -> OpenMLFlow:
OpenMLFlow
"""

return OpenMLFlow._from_dict(xmltodict.parse(flow_xml))
return OpenMLFlow._from_dict(xmltodict.parse(flow_xml, strip_whitespace=False))
10 changes: 6 additions & 4 deletions openml/runs/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,9 +768,11 @@ def obtain_field(xml_obj, fieldname, from_server, cast=None):
else:
raise AttributeError("Run XML does not contain required (server) " "field: ", fieldname)

run = xmltodict.parse(xml, force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"])[
"oml:run"
]
run = xmltodict.parse(
xml,
strip_whitespace=False,
force_list=["oml:file", "oml:evaluation", "oml:parameter_setting"],
)["oml:run"]
run_id = obtain_field(run, "oml:run_id", from_server, cast=int)
uploader = obtain_field(run, "oml:uploader", from_server, cast=int)
uploader_name = obtain_field(run, "oml:uploader_name", from_server)
Expand Down Expand Up @@ -1096,7 +1098,7 @@ def _list_runs(
def __list_runs(api_call, output_format="dict"):
"""Helper function to parse API calls which are lists of runs"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
runs_dict = xmltodict.parse(xml_string, force_list=("oml:run",))
runs_dict = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:run",))
# Minimalistic check if the XML is useful
if "oml:runs" not in runs_dict:
raise ValueError('Error in return XML, does not contain "oml:runs": %s' % str(runs_dict))
Expand Down
4 changes: 3 additions & 1 deletion openml/runs/trace.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,9 @@ def trace_from_xml(cls, xml):
Object containing the run id and a dict containing the trace
iterations.
"""
result_dict = xmltodict.parse(xml, force_list=("oml:trace_iteration",))["oml:trace"]
result_dict = xmltodict.parse(
xml, strip_whitespace=False, force_list=("oml:trace_iteration",)
)["oml:trace"]

run_id = result_dict["oml:run_id"]
trace = OrderedDict()
Expand Down
8 changes: 4 additions & 4 deletions openml/setups/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def setup_exists(flow) -> int:
result = openml._api_calls._perform_api_call(
"/setup/exists/", "post", file_elements=file_elements
)
result_dict = xmltodict.parse(result)
result_dict = xmltodict.parse(result, strip_whitespace=False)
setup_id = int(result_dict["oml:setup_exists"]["oml:id"])
if setup_id > 0:
return setup_id
Expand All @@ -65,7 +65,7 @@ def _get_cached_setup(setup_id):
try:
setup_file = os.path.join(setup_cache_dir, "description.xml")
with io.open(setup_file, encoding="utf8") as fh:
setup_xml = xmltodict.parse(fh.read())
setup_xml = xmltodict.parse(fh.read(), strip_whitespace=False)
setup = _create_setup_from_xml(setup_xml, output_format="object")
return setup

Expand Down Expand Up @@ -103,7 +103,7 @@ def get_setup(setup_id):
with io.open(setup_file, "w", encoding="utf8") as fh:
fh.write(setup_xml)

result_dict = xmltodict.parse(setup_xml)
result_dict = xmltodict.parse(setup_xml, strip_whitespace=False)
return _create_setup_from_xml(result_dict, output_format="object")


Expand Down Expand Up @@ -190,7 +190,7 @@ def _list_setups(setup=None, output_format="object", **kwargs):
def __list_setups(api_call, output_format="object"):
"""Helper function to parse API calls which are lists of setups"""
xml_string = openml._api_calls._perform_api_call(api_call, "get")
setups_dict = xmltodict.parse(xml_string, force_list=("oml:setup",))
setups_dict = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:setup",))
openml_uri = "http://openml.org/openml"
# Minimalistic check if the XML is useful
if "oml:setups" not in setups_dict:
Expand Down
12 changes: 7 additions & 5 deletions openml/study/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,9 @@ def _get_study(id_: Union[int, str], entity_type) -> BaseStudy:
"oml:run_id",
"oml:tag", # legacy.
)
result_dict = xmltodict.parse(xml_string, force_list=force_list_tags)["oml:study"]
result_dict = xmltodict.parse(xml_string, strip_whitespace=False, force_list=force_list_tags)[
"oml:study"
]
study_id = int(result_dict["oml:id"])
alias = result_dict["oml:alias"] if "oml:alias" in result_dict else None
main_entity_type = result_dict["oml:main_entity_type"]
Expand Down Expand Up @@ -274,7 +276,7 @@ def update_study_status(study_id: int, status: str) -> None:
raise ValueError("Illegal status value. " "Legal values: %s" % legal_status)
data = {"study_id": study_id, "status": status}
result_xml = openml._api_calls._perform_api_call("study/status/update", "post", data=data)
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
server_study_id = result["oml:study_status_update"]["oml:id"]
server_status = result["oml:study_status_update"]["oml:status"]
if status != server_status or int(study_id) != int(server_study_id):
Expand Down Expand Up @@ -354,7 +356,7 @@ def attach_to_study(study_id: int, run_ids: List[int]) -> int:
uri = "study/%d/attach" % study_id
post_variables = {"ids": ",".join(str(x) for x in run_ids)}
result_xml = openml._api_calls._perform_api_call(uri, "post", post_variables)
result = xmltodict.parse(result_xml)["oml:study_attach"]
result = xmltodict.parse(result_xml, strip_whitespace=False)["oml:study_attach"]
return int(result["oml:linked_entities"])


Expand Down Expand Up @@ -397,7 +399,7 @@ def detach_from_study(study_id: int, run_ids: List[int]) -> int:
uri = "study/%d/detach" % study_id
post_variables = {"ids": ",".join(str(x) for x in run_ids)}
result_xml = openml._api_calls._perform_api_call(uri, "post", post_variables)
result = xmltodict.parse(result_xml)["oml:study_detach"]
result = xmltodict.parse(result_xml, strip_whitespace=False)["oml:study_detach"]
return int(result["oml:linked_entities"])


Expand Down Expand Up @@ -567,7 +569,7 @@ def _list_studies(output_format="dict", **kwargs) -> Union[Dict, pd.DataFrame]:

def __list_studies(api_call, output_format="object") -> Union[Dict, pd.DataFrame]:
xml_string = openml._api_calls._perform_api_call(api_call, "get")
study_dict = xmltodict.parse(xml_string, force_list=("oml:study",))
study_dict = xmltodict.parse(xml_string, strip_whitespace=False, force_list=("oml:study",))

# Minimalistic check if the XML is useful
assert type(study_dict["oml:study_list"]["oml:study"]) == list, type(
Expand Down
8 changes: 5 additions & 3 deletions openml/tasks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def _get_estimation_procedure_list():
url_suffix = "estimationprocedure/list"
xml_string = openml._api_calls._perform_api_call(url_suffix, "get")

procs_dict = xmltodict.parse(xml_string)
procs_dict = xmltodict.parse(xml_string, strip_whitespace=False)
# Minimalistic check if the XML is useful
if "oml:estimationprocedures" not in procs_dict:
raise ValueError("Error in return XML, does not contain tag " "oml:estimationprocedures.")
Expand Down Expand Up @@ -232,7 +232,9 @@ def _list_tasks(task_type=None, output_format="dict", **kwargs):

def __list_tasks(api_call, output_format="dict"):
xml_string = openml._api_calls._perform_api_call(api_call, "get")
tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
tasks_dict = xmltodict.parse(
xml_string, strip_whitespace=False, force_list=("oml:task", "oml:input")
)
# Minimalistic check if the XML is useful
if "oml:tasks" not in tasks_dict:
raise ValueError('Error in return XML, does not contain "oml:runs": %s' % str(tasks_dict))
Expand Down Expand Up @@ -405,7 +407,7 @@ def _create_task_from_xml(xml):
-------
OpenMLTask
"""
dic = xmltodict.parse(xml)["oml:task"]
dic = xmltodict.parse(xml, strip_whitespace=False)["oml:task"]
estimation_parameters = dict()
inputs = dict()
# Due to the unordered structure we obtain, we first have to extract
Expand Down
4 changes: 2 additions & 2 deletions openml/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def _tag_entity(entity_type, entity_id, tag, untag=False):
post_variables = {"%s_id" % entity_type: entity_id, "tag": tag}
result_xml = openml._api_calls._perform_api_call(uri, "post", post_variables)

result = xmltodict.parse(result_xml, force_list={"oml:tag"})[main_tag]
result = xmltodict.parse(result_xml, strip_whitespace=False, force_list={"oml:tag"})[main_tag]

if "oml:tag" in result:
return result["oml:tag"]
Expand Down Expand Up @@ -173,7 +173,7 @@ def _delete_entity(entity_type, entity_id):

url_suffix = "%s/%d" % (entity_type, entity_id)
result_xml = openml._api_calls._perform_api_call(url_suffix, "delete")
result = xmltodict.parse(result_xml)
result = xmltodict.parse(result_xml, strip_whitespace=False)
if "oml:%s_delete" % entity_type in result:
return True
else:
Expand Down