From 67952145a496746c5fa92e425c8eafe468e2fcd4 Mon Sep 17 00:00:00 2001 From: Chang Liu Date: Sun, 29 Jan 2023 22:32:08 -0500 Subject: [PATCH] Return unconverted properties for PyG converter --- .../torch_geometric_result_converter.py | 69 +++++--- tools/python_api/test/test_torch_geometric.py | 166 +++++++++++++++--- 2 files changed, 184 insertions(+), 51 deletions(-) diff --git a/tools/python_api/src_py/torch_geometric_result_converter.py b/tools/python_api/src_py/torch_geometric_result_converter.py index 4737e3f843..c728c274a9 100644 --- a/tools/python_api/src_py/torch_geometric_result_converter.py +++ b/tools/python_api/src_py/torch_geometric_result_converter.py @@ -13,7 +13,7 @@ def __init__(self, query_result): self.internal_id_to_pos_dict = {} self.pos_to_primary_key_dict = {} self.warning_messages = set() - self.ignored_properties = set() + self.unconverted_properties = {} self.properties_to_extract = self.query_result._get_properties_to_extract() def __get_node_property_names(self, table_name): @@ -89,8 +89,9 @@ def __populate_nodes_dict_and_deduplicte_edges(self): def __extract_properties_from_node(self, node, label, node_property_names): import torch for prop_name in node_property_names: - # Ignore properties that are marked as ignored - if (label, prop_name) in self.ignored_properties: + # If property is already marked as unconverted, then add it directly without further checks + if label in self.unconverted_properties and prop_name in self.unconverted_properties[label]: + self.__add_unconverted_property(node, label, prop_name) continue # Read primary key but do not add it to the node properties @@ -98,18 +99,20 @@ def __extract_properties_from_node(self, node, label, node_property_names): primary_key = node[prop_name] continue - # Ignore properties that are not supported by torch_geometric + # Mark properties that are not supported by torch_geometric as unconverted if node_property_names[prop_name]["type"] not in [Type.INT64.value, Type.DOUBLE.value, Type.BOOL.value]: self.warning_messages.add( - "Property {}.{} of type {} is not supported by torch_geometric. The property is ignored." + "Property {}.{} of type {} is not supported by torch_geometric. The property is marked as unconverted." .format(label, prop_name, node_property_names[prop_name]["type"])) - self.__ignore_property(label, prop_name) + self.__mark_property_unconverted(label, prop_name) + self.__add_unconverted_property(node, label, prop_name) continue if node[prop_name] is None: self.warning_messages.add( - "Property {}.{} has a null value. torch_geometric does not support null values. The property is ignored." + "Property {}.{} has a null value. torch_geometric does not support null values. The property is marked as unconverted." .format(label, prop_name)) - self.__ignore_property(label, prop_name) + self.__mark_property_unconverted(label, prop_name) + self.__add_unconverted_property(node, label, prop_name) continue if node_property_names[prop_name]['dimension'] == 0: @@ -124,18 +127,20 @@ def __extract_properties_from_node(self, node, label, node_property_names): curr_value = torch.BoolTensor(node[prop_name]) except ValueError: self.warning_messages.add( - "Property {}.{} cannot be converted to Tensor (likely due to nested list of variable length). The property is ignored." + "Property {}.{} cannot be converted to Tensor (likely due to nested list of variable length). The property is marked as unconverted." .format(label, prop_name)) - self.__ignore_property(label, prop_name) + self.__mark_property_unconverted(label, prop_name) + self.__add_unconverted_property(node, label, prop_name) continue # Check if the shape of the property is consistent if label in self.nodes_dict and prop_name in self.nodes_dict[label]: - # If the shape is inconsistent, then ignore the property + # If the shape is inconsistent, then mark the property as unconverted if curr_value.shape != self.nodes_dict[label][prop_name][0].shape: self.warning_messages.add( - "Property {}.{} has an inconsistent shape. The property is ignored." + "Property {}.{} has an inconsistent shape. The property is marked as unconverted." .format(label, prop_name)) - self.__ignore_property(label, prop_name) + self.__mark_property_unconverted(label, prop_name) + self.__add_unconverted_property(node, label, prop_name) continue # Create the dictionary for the label if it does not exist @@ -152,12 +157,27 @@ def __extract_properties_from_node(self, node, label, node_property_names): pos = len(self.nodes_dict[label][prop_name]) - 1 return pos, primary_key - def __ignore_property(self, label, prop_name): - self.ignored_properties.add((label, prop_name)) - if label in self.nodes_dict and prop_name in self.nodes_dict[label]: - del self.nodes_dict[label][prop_name] - if len(self.nodes_dict[label]) == 0: - del self.nodes_dict[label] + def __add_unconverted_property(self, node, label, prop_name): + self.unconverted_properties[label][prop_name].append( + node[prop_name]) + + def __mark_property_unconverted(self, label, prop_name): + import torch + if label not in self.unconverted_properties: + self.unconverted_properties[label] = {} + if prop_name not in self.unconverted_properties[label]: + if label in self.nodes_dict and prop_name in self.nodes_dict[label]: + self.unconverted_properties[label][prop_name] = self.nodes_dict[label][prop_name] + del self.nodes_dict[label][prop_name] + if len(self.nodes_dict[label]) == 0: + del self.nodes_dict[label] + for i in range(len(self.unconverted_properties[label][prop_name])): + # If the property is a tensor, convert it back to list (consistent with the original type) + if torch.is_tensor(self.unconverted_properties[label][prop_name][i]): + self.unconverted_properties[label][prop_name][i] = self.unconverted_properties[label][prop_name][i].tolist( + ) + else: + self.unconverted_properties[label][prop_name] = [] def __populate_edges_dict(self): # Post-process edges, map internal ids to positions @@ -181,7 +201,7 @@ def __convert_to_torch_geometric(self): import torch_geometric if len(self.nodes_dict) == 0: self.warning_messages.add( - "No nodes found or all nodes were ignored. Returning None.") + "No nodes found or all node properties are not converted. Returning None.") return None # If there is only one node type, then convert to torch_geometric.data.Data @@ -227,11 +247,14 @@ def __convert_to_torch_geometric(self): data.edge_index = edge_idx pos_to_primary_key_dict = self.pos_to_primary_key_dict[ label] if not is_hetero else self.pos_to_primary_key_dict - return data, pos_to_primary_key_dict + + unconverted_properties = self.unconverted_properties if is_hetero else self.unconverted_properties[next( + iter(self.unconverted_properties))] + return data, pos_to_primary_key_dict, unconverted_properties def get_as_torch_geometric(self): self.__populate_nodes_dict_and_deduplicte_edges() self.__populate_edges_dict() - data, pos_to_primary_key_dict = self.__convert_to_torch_geometric() + data, pos_to_primary_key_dict, unconverted_properties = self.__convert_to_torch_geometric() self.__print_warnings() - return data, pos_to_primary_key_dict + return data, pos_to_primary_key_dict, unconverted_properties diff --git a/tools/python_api/test/test_torch_geometric.py b/tools/python_api/test/test_torch_geometric.py index 3922a0c879..243bf81ac6 100644 --- a/tools/python_api/test/test_torch_geometric.py +++ b/tools/python_api/test/test_torch_geometric.py @@ -208,15 +208,15 @@ def test_to_torch_geometric_nodes_only(establish_connection): res = conn.execute(query) with warnings.catch_warnings(record=True) as ws: - torch_geometric_data, pos_to_idx = res.get_as_torch_geometric() + torch_geometric_data, pos_to_idx, unconverted_properties = res.get_as_torch_geometric() warnings_ground_truth = set([ - "Property person.courseScoresPerTerm cannot be converted to Tensor (likely due to nested list of variable length). The property is ignored.", - "Property person.lastJobDuration of type INTERVAL is not supported by torch_geometric. The property is ignored.", - "Property person.registerTime of type TIMESTAMP is not supported by torch_geometric. The property is ignored.", - "Property person.birthdate of type DATE is not supported by torch_geometric. The property is ignored.", - "Property person.fName of type STRING is not supported by torch_geometric. The property is ignored.", - "Property person.workedHours has an inconsistent shape. The property is ignored.", - "Property person.usedNames of type STRING is not supported by torch_geometric. The property is ignored.", + "Property person.courseScoresPerTerm cannot be converted to Tensor (likely due to nested list of variable length). The property is marked as unconverted.", + "Property person.lastJobDuration of type INTERVAL is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.registerTime of type TIMESTAMP is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.birthdate of type DATE is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.fName of type STRING is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.workedHours has an inconsistent shape. The property is marked as unconverted.", + "Property person.usedNames of type STRING is not supported by torch_geometric. The property is marked as unconverted.", ]) assert len(ws) == 7 for w in ws: @@ -252,6 +252,36 @@ def test_to_torch_geometric_nodes_only(establish_connection): assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i]]['eyeSight'] - \ torch_geometric_data.eyeSight[i].item() < 1e-6 + assert len(unconverted_properties) == 7 + assert 'courseScoresPerTerm' in unconverted_properties + for i in range(8): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['courseScoresPerTerm'] == unconverted_properties['courseScoresPerTerm'][i] + assert 'lastJobDuration' in unconverted_properties + for i in range(8): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['lastJobDuration'] == unconverted_properties['lastJobDuration'][i] + assert 'registerTime' in unconverted_properties + for i in range(8): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['registerTime'] == unconverted_properties['registerTime'][i] + assert 'birthdate' in unconverted_properties + for i in range(8): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['birthdate'] == unconverted_properties['birthdate'][i] + assert 'fName' in unconverted_properties + for i in range(8): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['fName'] == unconverted_properties['fName'][i] + assert 'usedNames' in unconverted_properties + for i in range(8): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['usedNames'] == unconverted_properties['usedNames'][i] + + assert 'workedHours' in unconverted_properties + for i in range(8): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['workedHours'] == unconverted_properties['workedHours'][i] def test_to_torch_geometric_homogeneous_graph(establish_connection): conn, _ = establish_connection @@ -259,15 +289,15 @@ def test_to_torch_geometric_homogeneous_graph(establish_connection): res = conn.execute(query) with warnings.catch_warnings(record=True) as ws: - torch_geometric_data, pos_to_idx = res.get_as_torch_geometric() + torch_geometric_data, pos_to_idx, unconverted_properties = res.get_as_torch_geometric() warnings_ground_truth = set([ - "Property person.courseScoresPerTerm cannot be converted to Tensor (likely due to nested list of variable length). The property is ignored.", - "Property person.lastJobDuration of type INTERVAL is not supported by torch_geometric. The property is ignored.", - "Property person.registerTime of type TIMESTAMP is not supported by torch_geometric. The property is ignored.", - "Property person.birthdate of type DATE is not supported by torch_geometric. The property is ignored.", - "Property person.fName of type STRING is not supported by torch_geometric. The property is ignored.", - "Property person.workedHours has an inconsistent shape. The property is ignored.", - "Property person.usedNames of type STRING is not supported by torch_geometric. The property is ignored.", + "Property person.courseScoresPerTerm cannot be converted to Tensor (likely due to nested list of variable length). The property is marked as unconverted.", + "Property person.lastJobDuration of type INTERVAL is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.registerTime of type TIMESTAMP is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.birthdate of type DATE is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.fName of type STRING is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.workedHours has an inconsistent shape. The property is marked as unconverted.", + "Property person.usedNames of type STRING is not supported by torch_geometric. The property is marked as unconverted.", ]) assert len(ws) == 7 for w in ws: @@ -303,6 +333,37 @@ def test_to_torch_geometric_homogeneous_graph(establish_connection): assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i]]['eyeSight'] - \ torch_geometric_data.eyeSight[i].item() < 1e-6 + assert len(unconverted_properties) == 7 + assert 'courseScoresPerTerm' in unconverted_properties + for i in range(7): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['courseScoresPerTerm'] == unconverted_properties['courseScoresPerTerm'][i] + assert 'lastJobDuration' in unconverted_properties + for i in range(7): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['lastJobDuration'] == unconverted_properties['lastJobDuration'][i] + assert 'registerTime' in unconverted_properties + for i in range(7): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['registerTime'] == unconverted_properties['registerTime'][i] + assert 'birthdate' in unconverted_properties + for i in range(7): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['birthdate'] == unconverted_properties['birthdate'][i] + assert 'fName' in unconverted_properties + for i in range(7): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['fName'] == unconverted_properties['fName'][i] + assert 'usedNames' in unconverted_properties + for i in range(7): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['usedNames'] == unconverted_properties['usedNames'][i] + + assert 'workedHours' in unconverted_properties + for i in range(7): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx[i] + ]['workedHours'] == unconverted_properties['workedHours'][i] + assert torch_geometric_data.edge_index.shape == torch.Size([2, 14]) for i in range(14): src, dst = torch_geometric_data.edge_index[0][i].item( @@ -319,19 +380,19 @@ def test_to_torch_geometric_heterogeneous_graph(establish_connection): res = conn.execute(query) with warnings.catch_warnings(record=True) as ws: - torch_geometric_data, pos_to_idx = res.get_as_torch_geometric() + torch_geometric_data, pos_to_idx, unconverted_properties = res.get_as_torch_geometric() assert len(ws) == 9 warnings_ground_truth = set([ - "Property organisation.name of type STRING is not supported by torch_geometric. The property is ignored.", - "Property person.courseScoresPerTerm cannot be converted to Tensor (likely due to nested list of variable length). The property is ignored.", - "Property person.lastJobDuration of type INTERVAL is not supported by torch_geometric. The property is ignored.", - "Property person.registerTime of type TIMESTAMP is not supported by torch_geometric. The property is ignored.", - "Property person.birthdate of type DATE is not supported by torch_geometric. The property is ignored.", - "Property person.fName of type STRING is not supported by torch_geometric. The property is ignored.", - "Property organisation.history of type STRING is not supported by torch_geometric. The property is ignored.", - "Property person.usedNames of type STRING is not supported by torch_geometric. The property is ignored.", - "Property organisation.licenseValidInterval of type INTERVAL is not supported by torch_geometric. The property is ignored.", + "Property organisation.name of type STRING is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.courseScoresPerTerm cannot be converted to Tensor (likely due to nested list of variable length). The property is marked as unconverted.", + "Property person.lastJobDuration of type INTERVAL is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.registerTime of type TIMESTAMP is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.birthdate of type DATE is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.fName of type STRING is not supported by torch_geometric. The property is marked as unconverted.", + "Property organisation.history of type STRING is not supported by torch_geometric. The property is marked as unconverted.", + "Property person.usedNames of type STRING is not supported by torch_geometric. The property is marked as unconverted.", + "Property organisation.licenseValidInterval of type INTERVAL is not supported by torch_geometric. The property is marked as unconverted.", ]) for w in ws: @@ -367,6 +428,33 @@ def test_to_torch_geometric_heterogeneous_graph(establish_connection): assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx['person'][i]]['eyeSight'] - \ torch_geometric_data['person'].eyeSight[i].item() < 1e-6 + assert 'person' in unconverted_properties + assert len(unconverted_properties['person']) == 6 + assert 'courseScoresPerTerm' in unconverted_properties['person'] + for i in range(4): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx['person'][i] + ]['courseScoresPerTerm'] == unconverted_properties['person']['courseScoresPerTerm'][i] + assert 'lastJobDuration' in unconverted_properties['person'] + for i in range(4): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx['person'][i] + ]['lastJobDuration'] == unconverted_properties['person']['lastJobDuration'][i] + assert 'registerTime' in unconverted_properties['person'] + for i in range(4): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx['person'][i] + ]['registerTime'] == unconverted_properties['person']['registerTime'][i] + assert 'birthdate' in unconverted_properties['person'] + for i in range(4): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx['person'][i] + ]['birthdate'] == unconverted_properties['person']['birthdate'][i] + assert 'fName' in unconverted_properties['person'] + for i in range(4): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx['person'][i] + ]['fName'] == unconverted_properties['person']['fName'][i] + assert 'usedNames' in unconverted_properties['person'] + for i in range(4): + assert TINY_SNB_PERSONS_GROUND_TRUTH[pos_to_idx['person'][i] + ]['usedNames'] == unconverted_properties['person']['usedNames'][i] + assert torch_geometric_data['person', 'person'].edge_index.shape == torch.Size([ 2, 6]) for i in range(3): @@ -402,6 +490,23 @@ def test_to_torch_geometric_heterogeneous_graph(establish_connection): assert TINY_SNB_ORGANISATIONS_GROUND_TRUTH[pos_to_idx['organisation'][i] ]['rating'] - torch_geometric_data['organisation'].rating[i].item() < 1e-6 + assert 'organisation' in unconverted_properties + assert len(unconverted_properties['organisation']) == 3 + assert 'name' in unconverted_properties['organisation'] + for i in range(2): + assert TINY_SNB_ORGANISATIONS_GROUND_TRUTH[pos_to_idx['organisation'][i] + ]['name'] == unconverted_properties['organisation']['name'][i] + + assert 'history' in unconverted_properties['organisation'] + for i in range(2): + assert TINY_SNB_ORGANISATIONS_GROUND_TRUTH[pos_to_idx['organisation'][i] + ]['history'] == unconverted_properties['organisation']['history'][i] + + assert 'licenseValidInterval' in unconverted_properties['organisation'] + for i in range(2): + assert TINY_SNB_ORGANISATIONS_GROUND_TRUTH[pos_to_idx['organisation'][i] + ]['licenseValidInterval'] == unconverted_properties['organisation']['licenseValidInterval'][i] + assert torch_geometric_data['person', 'organisation'].edge_index.shape == torch.Size([ 2, 2]) for i in range(2): @@ -419,9 +524,9 @@ def test_to_torch_geometric_multi_dimensonal_lists(establish_connection): res = conn.execute(query) with warnings.catch_warnings(record=True) as ws: - torch_geometric_data, pos_to_idx = res.get_as_torch_geometric() + torch_geometric_data, pos_to_idx, unconverted_properties = res.get_as_torch_geometric() assert len(ws) == 1 - assert str(ws[0].message) == "Property tensor.oneDimInt has a null value. torch_geometric does not support null values. The property is ignored." + assert str(ws[0].message) == "Property tensor.oneDimInt has a null value. torch_geometric does not support null values. The property is marked as unconverted." bool_list = [] float_list = [] @@ -448,3 +553,8 @@ def test_to_torch_geometric_multi_dimensonal_lists(establish_connection): assert torch_geometric_data.intTensor.shape == int_tensor.shape assert torch_geometric_data.intTensor.dtype == int_tensor.dtype assert torch.all(torch_geometric_data.intTensor == int_tensor) + + assert len(unconverted_properties) == 1 + assert "oneDimInt" in unconverted_properties + assert len(unconverted_properties["oneDimInt"]) == 6 + assert unconverted_properties["oneDimInt"] == [1, 2, None, None, 5, 6]