Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve the metadata to satisfy the need of HMA #165

Open
wants to merge 20 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions sdgx/data_models/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ class MetadataCombiner(BaseModel):

def __init__(self, *args, **kwargs):
    """Initialize the combiner and guarantee that ``relationships`` is set.

    Args:
        *args: Positional arguments forwarded to the parent initializer.
        **kwargs: Keyword arguments forwarded to the parent initializer. An
            optional ``relationships`` entry is stored on the instance.
    """
    super().__init__(*args, **kwargs)
    # ``kwargs.get`` avoids the KeyError that ``kwargs["relationships"]``
    # raised when the caller omitted the argument. A missing, ``None`` or
    # empty value falls back to a fresh empty list.
    self.relationships = kwargs.get("relationships") or []
MooooCat marked this conversation as resolved.
Show resolved Hide resolved

def check(self):
"""Do necessary checks:
Expand Down
180 changes: 178 additions & 2 deletions sdgx/data_models/metadata.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from __future__ import annotations

import json
import warnings
from collections import defaultdict
from collections.abc import Iterable
from copy import deepcopy
from itertools import chain
from pathlib import Path
from typing import Any, Dict, Set
from typing import Any, Dict, List, Set

import pandas as pd
from pydantic import BaseModel
from pydantic import BaseModel, Field

from sdgx.data_loader import DataLoader
from sdgx.data_models.inspectors.base import RelationshipInspector
Expand All @@ -34,6 +36,27 @@ class Metadata(BaseModel):
column_list(list[str]): list of the column names in the table; the other column lists are used to store column information.
"""

# Known sdtypes, each mapped to a frozenset of keyword names.
# NOTE(review): presumably these are the sdtype-specific keyword arguments;
# within this view only the dict keys are consulted (by ``add_column``).
_SDTYPE_KWARGS = {
    "numerical": frozenset({"computer_representation"}),
    "datetime": frozenset({"datetime_format"}),
    "categorical": frozenset({"order", "order_by"}),
    "boolean": frozenset(),
    "id": frozenset({"regex_format"}),
    "unknown": frozenset({"pii"}),
}

# Top-level keys that ``load_from_dict`` looks up in a metadata dictionary.
_KEYS = frozenset(
    {
        "columns",
        "primary_key",
        "alternate_keys",
        "sequence_key",
        "sequence_index",
        "column_relationships",
        "METADATA_SPEC_VERSION",
    }
)

primary_keys: Set[str] = set()
"""
primary_keys is used to store single primary key or composite primary key
Expand Down Expand Up @@ -70,6 +93,159 @@ class Metadata(BaseModel):
"""
For extend information, use ``get`` and ``set``
"""
# Per-column metadata keyed by column name; written by ``add_column`` and
# ``update_column``. ``__init__`` replaces this default with a plain dict.
columns: Dict = defaultdict(str)
# Name of the primary key column; starts as the string "default".
# NOTE(review): annotated ``str`` but ``remove_primary_key`` assigns ``None``
# and ``set_primary_key`` compares against ``None`` - confirm the intended type.
primary_key: str = "default"
# Alternate key column names.
# NOTE(review): ``optional=True`` does not look like a documented ``Field``
# argument - confirm it is needed for the pydantic version in use.
alternate_keys: List[str] = Field(default_factory=list, optional=True)

def __init__(self, **data: Any):
    """Build the metadata object from keyword field values.

    Args:
        **data: Field values forwarded to the parent initializer.
    """
    super().__init__(**data)
    # Apply the fallbacks only to fields the caller did not supply.
    # Resetting them unconditionally silently discarded any ``columns``,
    # ``primary_key`` or ``alternate_keys`` passed to the constructor.
    if "columns" not in data:
        self.columns = {}
    if "primary_key" not in data:
        self.primary_key = "default"
    if "alternate_keys" not in data:
        self.alternate_keys = []

# ----------------------------------------------------------------------
MooooCat marked this conversation as resolved.
Show resolved Hide resolved
def add_column(self, column_name, **kwargs):
    """Add a new column description to the metadata.

    A deep copy of ``kwargs`` is stored under ``column_name`` in
    ``self.columns``. When ``sdtype`` is not one of the keys of
    ``_SDTYPE_KWARGS``, ``pii`` defaults to ``True`` unless the caller
    supplied it.

    Args:
        column_name (str):
            The column name to be added.
        **kwargs:
            Keyword arguments describing the column; ``sdtype`` is required.

    Raises:
        ValueError: if the column already exists, or if ``kwargs`` carries no
            ``sdtype`` (or ``sdtype`` is ``None``).
    """
    # ValueError is still caught by ``except Exception`` handlers, but lets
    # callers tell bad input apart from unrelated failures.
    if column_name in self.columns:
        raise ValueError(
            f"Column name '{column_name}' already exists. Use 'update_column' "
            "to update an existing column."
        )

    sdtype = kwargs.get("sdtype")
    if sdtype is None:
        raise ValueError(f"Please provide a 'sdtype' for column '{column_name}'.")

    # Copy so later mutation of the caller's values cannot leak into the metadata.
    column_kwargs = deepcopy(kwargs)
    if sdtype not in self._SDTYPE_KWARGS:
        column_kwargs.setdefault("pii", True)

    self.columns[column_name] = column_kwargs

def update_column(self, column_name, **kwargs):
    """Replace the metadata of an existing column.

    The stored entry for ``column_name`` is replaced by a deep copy of
    ``kwargs``. When ``kwargs`` carries no ``sdtype``, the column's current
    ``sdtype`` is kept.

    Args:
        column_name (str):
            The column name to be updated.
        **kwargs:
            Keyword arguments that describe the metadata for the column.

    Raises:
        KeyError: if the column does not already exist in the metadata.
    """
    # Without this check a call that passed ``sdtype`` silently created a new
    # column, and one that did not failed with an opaque lookup error.
    if column_name not in self.columns:
        raise KeyError(
            f"Column name '{column_name}' does not exist. Use 'add_column' "
            "to add a new column."
        )

    updated = deepcopy(kwargs)
    if "sdtype" not in updated:
        # Carry over the existing sdtype when the caller does not change it.
        updated["sdtype"] = self.columns[column_name]["sdtype"]

    self.columns[column_name] = updated

def set_primary_key(self, column_name):
    """Set the metadata primary key.

    If ``column_name`` is currently listed as an alternate key it is removed
    from that list, with a warning. A warning is also emitted when a
    previously set primary key is being replaced.

    Args:
        column_name (str):
            Name of the primary key column(s).
    """
    if column_name in self.alternate_keys:
        warnings.warn(
            f"'{column_name}' is currently set as an alternate key and will be removed from "
            "that list.",
            stacklevel=2,
        )
        self.alternate_keys.remove(column_name)

    # "default" is the initial value assigned at construction, not a key the
    # user chose. Warning about replacing it made every first call emit a
    # spurious "existing primary key 'default'" message.
    if self.primary_key not in (None, "default"):
        warnings.warn(
            f"There is an existing primary key '{self.primary_key}'."
            " This key will be removed.",
            stacklevel=2,
        )

    self.primary_key = column_name

def remove_primary_key(self):
    """Clear the primary key, warning when there is none to clear."""
    nothing_to_remove = self.primary_key is None
    if nothing_to_remove:
        warnings.warn("No primary key exists to remove.")

    # The key ends up unset either way.
    self.primary_key = None

def add_column_relationship(self, relationship_type, column_names):
    """Add a column relationship to the metadata.

    The relationship is appended to ``self.column_relationships`` as a dict
    with the keys ``"type"`` and ``"column_names"``. No validation is
    performed on the relationship.

    Args:
        relationship_type (str):
            Type of column relationship.
        column_names (list[str]):
            List of column names in the relationship.
    """
    # The former ``to_check`` list was built but never used, so it is dropped.
    relationship = {"type": relationship_type, "column_names": column_names}
    self.column_relationships.append(relationship)

def _get_primary_and_alternate_keys(self):
    """Collect the alternate keys together with the primary key.

    Returns:
        set:
            The alternate keys, plus the primary key when it is truthy.
    """
    primary = self.primary_key
    if not primary:
        return set(self.alternate_keys)

    return {*self.alternate_keys, primary}
Comment on lines +209 to +220
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have you tested this function? I'm not sure whether primary_key is a dict. If so, we may need to change the name.


@staticmethod
def _get_invalid_column_values(column, validation_function):
valid = column.apply(validation_function).astype(bool)

return set(column[~valid])

@classmethod
def load_from_dict(cls, metadata_dict):
    """Build an instance of this class from a plain python ``dict``.

    Every key listed in ``_KEYS`` is looked up in ``metadata_dict``. Truthy
    values are deep-copied onto a freshly constructed instance; missing or
    falsy values are skipped.

    Args:
        metadata_dict (dict):
            Python dictionary representing the metadata.

    Returns:
        A new instance of ``cls``.
    """
    loaded = cls()
    for field_name in loaded._KEYS:
        field_value = deepcopy(metadata_dict.get(field_name))
        if not field_value:
            continue
        setattr(loaded, field_name, field_value)

    return loaded

# ----------------------------------------------------------------------
MooooCat marked this conversation as resolved.
Show resolved Hide resolved

@property
def tag_fields(self) -> Iterable[str]:
Expand Down
Loading