
[0.2.0] Metadata Implementation #81

Merged

Merged 37 commits into main from feature-metadata on Dec 27, 2023.
The diff below shows the changes from 7 of the 37 commits.

Commits
de083b8
Update file structure
MooooCat Dec 19, 2023
7834da0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 19, 2023
dd87ee1
Update single_table.py
MooooCat Dec 20, 2023
14e83e6
Merge branch 'main' into feature-metadata
MooooCat Dec 21, 2023
06a14f2
Update metadata (still draft)
MooooCat Dec 21, 2023
ac89e8e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 21, 2023
1c322d0
Update metadata and reset file structure
MooooCat Dec 23, 2023
462b3cb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 23, 2023
fb1565c
add numeric inspector
MooooCat Dec 23, 2023
f6da1d3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 23, 2023
7e65c83
fix metadata initialization
MooooCat Dec 23, 2023
329755b
fix type hits error in py38
MooooCat Dec 23, 2023
f2d50f5
add inspectors
MooooCat Dec 23, 2023
1f4ff08
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 23, 2023
10f0175
Update relationship.py
MooooCat Dec 24, 2023
c027feb
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 24, 2023
9b026fc
Update multi-table-combiner
MooooCat Dec 25, 2023
c768a24
add composite key list in relationship
MooooCat Dec 25, 2023
8af1e1d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 25, 2023
ed30dad
Apply suggestions from code review
MooooCat Dec 25, 2023
f2e4954
Apply suggestions from code review
MooooCat Dec 25, 2023
44d0c61
use a single list for primary key(s)
MooooCat Dec 25, 2023
75985e0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 25, 2023
cfd2a11
Sync with main.
MooooCat Dec 25, 2023
644568f
Update expections
MooooCat Dec 25, 2023
c1b239c
Update check functions
MooooCat Dec 25, 2023
d6077d6
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 25, 2023
89e4d1f
Apply suggestions from code review
MooooCat Dec 25, 2023
143eeb9
Apply suggestions from code review
MooooCat Dec 25, 2023
49e02c4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 25, 2023
b2889da
update some test case
MooooCat Dec 26, 2023
8faa6e2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 26, 2023
4cf8db0
update test cases
MooooCat Dec 27, 2023
98f4596
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 27, 2023
b50e41b
Merge branch 'main' into feature-metadata
MooooCat Dec 27, 2023
7687f77
update metadata save and load test cases.
MooooCat Dec 27, 2023
ad312c8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Dec 27, 2023
106 changes: 69 additions & 37 deletions sdgx/data_models/metadata.py
@@ -1,7 +1,6 @@
from __future__ import annotations

import json
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List

@@ -10,7 +9,7 @@

from sdgx.data_loader import DataLoader
from sdgx.data_models.inspectors.manager import InspectorManager
from sdgx.exceptions import MetadataInitError
from sdgx.exceptions import MetadaError, MetadataInitError
from sdgx.utils import logger


@@ -22,13 +21,9 @@ class Metadata(BaseModel):
For each column, there should be an instance of the Data Type object.

Args:
primary_key(str): The primary key, a field used to uniquely identify each row in the table.
primary_keys(List[str]): The primary key, a field used to uniquely identify each row in the table.
The primary key of each row must be unique and not empty.

composite_primary_key(bool): Whether to enable the composite primary key feature.

primary_key_list(bool): List of composite primary keys.

column_list(list[str]): list of the comlumn name in the table, other columns lists are used to store column information.
"""

@@ -39,6 +34,7 @@ class Metadata(BaseModel):
# variables related to columns
# column_list is used to store all columns' name
column_list: List[str] = []

# other columns lists are used to store column information
# here are 5 basic data types
id_columns: List[str] = []
@@ -47,11 +43,9 @@
discrete_columns: List[str] = []
datetime_columns: List[str] = []

# _column_dict = {}
_extend: Dict[str, Any] = {}

# version info
metadata_version: str = "1.0"
_extend: Dict[str, Any] = {}

def get(self, key: str, default=None) -> Any:
return getattr(self, key, getattr(self._extend, key, default))
@@ -76,7 +70,7 @@ def from_dataloader(
cls,
dataloader: DataLoader,
max_chunk: int = 10,
primary_key: str = None,
primary_keys: List[str] = None,
include_inspectors: list[str] | None = None,
exclude_inspectors: list[str] | None = None,
inspector_init_kwargs: dict[str, Any] | None = None,
@@ -107,11 +101,11 @@ def from_dataloader(
if all(i.ready for i in inspectors) or i > max_chunk:
break

# If primary_key is not specified, use the first column.
if primary_key is None:
primary_key = dataloader.columns()[0]
# If primary_key is not specified, use the first column (in list).
if primary_keys is None:
primary_keys = [dataloader.columns()[0]]

metadata = Metadata(primary_key=primary_key, column_list=dataloader.columns())
metadata = Metadata(primary_keys=primary_keys, column_list=dataloader.columns())
for inspector in inspectors:
metadata.update(inspector.inspect())

@@ -131,7 +125,7 @@ def from_dataframe(
for inspector in inspectors:
inspector.fit(df)

metadata = Metadata(primary_key=df.columns[0], column_list=list(df.columns))
metadata = Metadata(primary_keys=[df.columns[0]], column_list=list(df.columns))
for inspector in inspectors:
metadata.update(inspector.inspect())

@@ -147,41 +141,79 @@ def load(cls, path: str | Path) -> "Metadata":
attributes = json.load(path.open("r"))
return Metadata().update(attributes)

def check_single_primary_key(self, input_key: str):
"""Check whether a primary key in column_list and has ID data type.

Args:
input_key(str): the input primary_key str
"""

if input_key not in self.column_list:
raise MetadaError(f"Primary Key {input_key} not Exist in columns.")
if input_key not in self.id_columns:
raise MetadaError(f"Primary Key {input_key} should has ID DataType.")

def get_all_data_type_columns(self):
"""Get all column names from `self.xxx_columns`.

All Lists with the suffix _columns in model fields and extend fields need to be collected.
All defined column names will be counted.

Returns:
all_dtype_cols(set): set of all column names.
"""
all_dtype_cols = set()

# search the model fields and extend fields
for each_key in list(self.model_fields.keys()) + list(self._extend.keys()):
if each_key.endswith("_columns"):
column_names = self.get(each_key)
all_dtype_cols = all_dtype_cols.union(set(column_names))

return all_dtype_cols

def check(self):
"""Checks column info.

When passing as input to the next module, perform necessary checks, including:
-Is the primary key correctly defined.
-Is there any missing definition of the column.
-Are there any unknown columns that have been incorrectly updated.
-Is the primary key correctly defined(in column list) and has ID data type.
-Is there any missing definition of each column in table.
-Are there any unknown columns that have been incorrectly updated.
"""
# Not implemented yet
# check primary key in column_list and has ID data type
for each_key in self.primary_keys:
self.check_single_primary_key(each_key)

pass
all_dtype_columns = self.get_all_data_type_columns()

def update_primary_key(self, primary_key: str | list[str], composite_primary_key: bool = False):
# check missing columns
for each_column in self.column_list:
if each_column not in all_dtype_columns:
raise MetadaError(f"Undefined data type for column {each_column}.")

# check unfamiliar columns in dtypes
for each_dtype_column in all_dtype_columns:
if each_dtype_column not in self.column_list:
raise MetadaError(f"Found undefined column: {each_dtype_column}.")

Review discussion on the column checks above:

Collaborator: Maybe we can use set(self.column_list) - set(all_dtype_columns) here, so that we can report all invalid keys at once.

Collaborator: And maybe we can rename MetadaError to MetadataInvalidError.

MooooCat (Contributor, Author): This is a more concise way of writing it. The current implementation was chosen because the error messages raised in these two cases are different, which may make it easier for users to locate the two error types.

Wh1isper (Collaborator, Dec 25, 2023): There is no conflict; just use two if-expressions instead of two for-ifs.
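
A minimal sketch of the reviewer's suggestion, assuming it were applied inside Metadata.check() above (not part of this diff; the two distinct error messages are kept, as the author prefers):

    # Hypothetical rewrite of the two column checks in Metadata.check(),
    # using set differences so every offending column is reported at once.
    missing_definitions = set(self.column_list) - set(all_dtype_columns)
    if missing_definitions:
        raise MetadaError(f"Undefined data type for columns: {sorted(missing_definitions)}.")

    unknown_columns = set(all_dtype_columns) - set(self.column_list)
    if unknown_columns:
        raise MetadaError(f"Found undefined columns: {sorted(unknown_columns)}.")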

logger.info("Metadata check finished.")

def update_primary_key(self, primary_keys: List[str]):
"""Update the primary key of the table

When update the primary key, the original primary key will be erased.

Args:
primary_key(str | list[str]): the primary key or key list.

composite_primary_key(bool): whether this table use composite primary key.
primary_keys(List[str]): the primary keys of this table.
"""

if composite_primary_key is False and not isinstance(primary_key, str):
raise ValueError("Primary key should be a string")
if not isinstance(primary_keys, List):
raise MetadaError("Primary key should be a list.")

if composite_primary_key is True and len(primary_key) == 0:
raise ValueError("Composite primary key list shoud NOT be empty.")
for each_key in primary_keys:
if each_key not in self.column_list:
raise MetadaError("Primary key not exist in table columns.")

if composite_primary_key is True:
self._composite_primary_key = True
self.primary_key = None
self.primary_key_list = primary_key
else:
self._composite_primary_key = False
self.primary_key = primary_key
self.primary_keys = primary_keys

logger.info(f"Primary Key updated: {primary_key}.")
logger.info(f"Primary Key updated: {primary_keys}.")
16 changes: 11 additions & 5 deletions sdgx/data_models/multi_table_combiner.py
@@ -4,6 +4,8 @@

from sdgx.data_models.metadata import Metadata
from sdgx.data_models.relationship import Relationship
from sdgx.exceptions import MultiTableCombinerError
from sdgx.utils import logger


class MultiTableCombiner(BaseModel):
@@ -31,7 +33,7 @@ def check(self):
relationship_cnt = len(self.relationships)
metadata_cnt = len(self.metadata_dict.keys())
if metadata_cnt != relationship_cnt + 1:
raise ValueError("Number of tables should corresponds to relationships.")
raise MultiTableCombinerError("Number of tables should corresponds to relationships.")

# table name check
table_names_from_relationships = set()
@@ -40,15 +42,19 @@
table_names = list(self.metadata_dict.keys())
for each_r in self.relationships:
if each_r.parent_table not in table_names:
raise ValueError(f"Metadata of parent table {each_r.parent_table} is missing.")
raise MultiTableCombinerError(
f"Metadata of parent table {each_r.parent_table} is missing."
)
if each_r.child_table not in table_names:
raise ValueError(f"Metadata of child table {each_r.child_table} is missing.")
raise MultiTableCombinerError(
f"Metadata of child table {each_r.child_table} is missing."
)
table_names_from_relationships.add(each_r.parent_table)
table_names_from_relationships.add(each_r.child_table)

# each table in metadata must in a relationship
for each_t in table_names:
if each_t not in table_names_from_relationships:
raise ValueError(f"Table {each_t} has not relationship.")
raise MultiTableCombinerError(f"Table {each_t} has not relationship.")

return True
logger.info("MultiTableCombiner check finished.")
11 changes: 4 additions & 7 deletions sdgx/data_models/relationship.py
@@ -2,6 +2,8 @@

from pydantic import BaseModel

from sdgx.exceptions import RelationshipError


class Relationship(BaseModel):
"""Relationship between tables
@@ -18,15 +20,10 @@ class Relationship(BaseModel):
parent_table: str
child_table: str

# foreign keys
child_table_foreign_key: str = "foreign key undefined"

# for composite keys
composite_foreign_key: bool = False
child_table_composite_foreign_key: List[str] = []
foreign_keys: List[str] = []

def __init__(self, **kwargs):
super().__init__(**kwargs)

if self.parent_table == self.child_table:
raise ValueError("child table and parent table cannot be the same")
raise RelationshipError("child table and parent table cannot be the same")
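
A short sketch of the simplified relationship model: single and composite foreign keys now share the one foreign_keys list, and the composite_foreign_key flag is gone (table and key names are illustrative):

    from sdgx.data_models.relationship import Relationship

    # A single foreign key is just a one-element list.
    single = Relationship(parent_table="users", child_table="orders", foreign_keys=["user_id"])

    # A composite foreign key needs no separate flag or field any more.
    composite = Relationship(
        parent_table="orders",
        child_table="order_items",
        foreign_keys=["order_id", "region"],
    )

    # Identical parent and child tables are rejected with RelationshipError at construction time.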
21 changes: 21 additions & 0 deletions sdgx/exceptions.py
@@ -108,3 +108,24 @@ class CannotExportError(SdgxError):

EXIT_CODE = 107
ERROR_CODE = 8001


class DataModelError(SdgxError):
"""
Exception to indicate that exception in all data models.
"""

EXIT_CODE = 108
ERROR_CODE = 9001


class MetadaError(DataModelError):
ERROR_CODE = 9002


class RelationshipError(DataModelError):
ERROR_CODE = 9003


class MultiTableCombinerError(DataModelError):
ERROR_CODE = 9004
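
A small sketch of how callers might use the new exception hierarchy; metadata and combiner stand in for already-constructed objects:

    from sdgx.exceptions import DataModelError

    try:
        metadata.check()
        combiner.check()
    except DataModelError as exc:
        # MetadaError (9002), RelationshipError (9003) and MultiTableCombinerError (9004)
        # all derive from DataModelError, so one except clause covers any data-model failure.
        print(f"Data model validation failed: {exc}")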