Add huggingface based models exporter (#289)
* feat(models): Add huggingface based models exporter

* fix(mypy): Resolve mypy issues

* feat(exporter): Add ONNX exporter

* project(dependencies): Update project dependencies

* tests(exporter, pipeline): Add model export tests and add pipeline task name test

* refactor(exporter): Refactor exporter and tests after review

* feat(pipeline): Update QA pipeline

* refactor(pyproject.toml): Resolve missing import issue

* refactor(tests): Refactor tests

* refactor(black): Refactor styling

* feat(tests): Update sequence labeling tests and fix onnx exporter

* feat(pyproject.toml): Update pyproject.toml

* fix: Fix issues after refactor

* feat: Add EvaluationResult __call__ method

* fix(tests,exporter): Fix tests and model exporter

* feat(evaluator): Remove __call__ function and add improved __repr__ method

* fix(tests): Fix sequence labeling test

* fix(QA): Fix qa pipeline

* feat(QA): Fix qa and add exporter tests
ktagowski authored Apr 15, 2023
1 parent 63db321 commit 8d8ec7e
Showing 25 changed files with 1,884 additions and 654 deletions.
Empty file.
6 changes: 3 additions & 3 deletions embeddings/data/datamodule.py
@@ -184,7 +184,7 @@ def process_data(self) -> None:

     def train_dataloader(self) -> DataLoader[HuggingFaceDataset]:
         return DataLoader(
-            dataset=self.dataset["train"],  # type: ignore
+            dataset=self.dataset["train"],
             batch_size=self.train_batch_size,
             collate_fn=self.collate_fn,
             shuffle=True,
@@ -196,7 +196,7 @@ def train_dataloader(self) -> DataLoader[HuggingFaceDataset]:
     def val_dataloader(self) -> Optional[DataLoader[HuggingFaceDataset]]:  # type: ignore
         if "validation" in self.dataset:
             return DataLoader(
-                dataset=self.dataset["validation"],  # type: ignore
+                dataset=self.dataset["validation"],
                 batch_size=self.eval_batch_size,
                 collate_fn=self.collate_fn,
                 shuffle=False,
@@ -207,7 +207,7 @@ def val_dataloader(self) -> Optional[DataLoader[HuggingFaceDataset]]:  # type: i

     def test_dataloader(self) -> DataLoader[HuggingFaceDataset]:
         return DataLoader(
-            dataset=self.dataset["test"],  # type: ignore
+            dataset=self.dataset["test"],
             batch_size=self.eval_batch_size,
             collate_fn=self.collate_fn,
             shuffle=False,
16 changes: 13 additions & 3 deletions embeddings/evaluator/evaluation_results.py
@@ -45,14 +45,24 @@ def from_evaluation_json(cls, data: str) -> "Predictions":

 @dataclass
 class EvaluationResults:
+    def __repr__(self) -> str:
+        fields = asdict(self)
+        fields.pop("data")
+        return (
+            self.__class__.__qualname__
+            + "("
+            + ", ".join([f"{k}={v}" for k, v in fields.items()])
+            + ")"
+        )
+
     @property
     def metrics(self) -> Dict[str, Any]:
         result = asdict(self)
         result.pop("data")
         return result


-@dataclass
+@dataclass(repr=False)
 class TextClassificationEvaluationResults(EvaluationResults):
     accuracy: float
     f1_macro: float
@@ -68,7 +78,7 @@ class TextClassificationEvaluationResults(EvaluationResults):
     data: Optional[Data] = None


-@dataclass
+@dataclass(repr=False)
 class SequenceLabelingEvaluationResults(EvaluationResults):
     accuracy: float
     f1_macro: float
@@ -84,7 +94,7 @@ class SequenceLabelingEvaluationResults(EvaluationResults):
     data: Optional[Data] = None


-@dataclass
+@dataclass(repr=False)
 class QuestionAnsweringEvaluationResults(EvaluationResults):
     exact: float
     f1: float
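The repr=False decorators keep these result dataclasses on the custom __repr__ defined in the base class, which omits the potentially large data field. A minimal, self-contained sketch of the behaviour (the field values and the simplified Data type are illustrative, not taken from the diff):

```python
from dataclasses import asdict, dataclass
from typing import Any, Dict, Optional


@dataclass
class EvaluationResults:
    def __repr__(self) -> str:
        fields = asdict(self)
        fields.pop("data")
        return (
            self.__class__.__qualname__
            + "("
            + ", ".join([f"{k}={v}" for k, v in fields.items()])
            + ")"
        )


@dataclass(repr=False)  # keep the inherited __repr__ instead of a generated one
class QuestionAnsweringEvaluationResults(EvaluationResults):
    exact: float
    f1: float
    data: Optional[Dict[str, Any]] = None  # stands in for the library's Data type


print(QuestionAnsweringEvaluationResults(exact=71.3, f1=80.2, data={"predictions": []}))
# QuestionAnsweringEvaluationResults(exact=71.3, f1=80.2)
```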
15 changes: 8 additions & 7 deletions embeddings/evaluator/question_answering_evaluator.py
@@ -3,9 +3,6 @@
 import torch
 from numpy import typing as nptyping

-from embedding.transformation.lightning_transformation.question_answering_output_transformation import (
-    QAPredictionPostProcessor,
-)
 from embeddings.evaluator.evaluation_results import Predictions, QuestionAnsweringEvaluationResults
 from embeddings.evaluator.metrics_evaluator import MetricsEvaluator
 from embeddings.metric.metric import Metric
@@ -14,20 +11,24 @@
     QA_GOLD_ANSWER_TYPE,
     QA_PREDICTED_ANSWER_TYPE,
 )
+from embeddings.transformation.lightning_transformation.question_answering_output_transformation import (
+    QAPredictionPostProcessor,
+)


 class QuestionAnsweringEvaluator(
     MetricsEvaluator[Dict[str, Any], QuestionAnsweringEvaluationResults]
 ):
+    def __init__(self, no_answer_threshold: float = 1.0):
+        super().__init__(return_input_data=True)
+        self.metric = SQUADv2Metric(no_answer_threshold=no_answer_threshold)
+        self.postprocessor = QAPredictionPostProcessor()
+
     def metrics(
         self,
     ) -> Dict[str, Metric[Union[List[Any], nptyping.NDArray[Any], torch.Tensor], Dict[Any, Any]]]:
         return {}

-    def __init__(self, no_answer_threshold: float = 1.0):
-        self.metric = SQUADv2Metric(no_answer_threshold=no_answer_threshold)
-        self.postprocessor = QAPredictionPostProcessor()
-
     def evaluate(
         self, data: Union[Dict[str, nptyping.NDArray[Any]], Predictions, Dict[str, Any]]
     ) -> QuestionAnsweringEvaluationResults:
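After this change the evaluator calls super().__init__(return_input_data=True), presumably so that evaluate() also receives the raw QA examples needed to post-process span predictions into answer texts. A hedged usage sketch (the threshold value is only an example; it is forwarded to the SQuAD v2 metric):

```python
from embeddings.evaluator.question_answering_evaluator import QuestionAnsweringEvaluator

evaluator = QuestionAnsweringEvaluator(no_answer_threshold=1.0)
# result = evaluator.evaluate(data)  # data: model outputs together with the input examples
# print(result)                      # QuestionAnsweringEvaluationResults(exact=..., f1=..., ...)
```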
2 changes: 1 addition & 1 deletion embeddings/metric/seqeval_metric.py
@@ -88,7 +88,7 @@ def _info(self) -> evaluate.MetricInfo:
             citation=_CITATION,
             homepage="https://github.com/chakki-works/seqeval",
             inputs_description=_KWARGS_DESCRIPTION,
-            features=datasets.Features(  # type: ignore
+            features=datasets.Features(
                 {
                     "predictions": datasets.Sequence(
                         datasets.Value("string", id="label"), id="sequence"
2 changes: 1 addition & 1 deletion embeddings/model/lightning_module/question_answering.py
@@ -125,7 +125,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any:

     def shared_step(self, **batch: Any) -> Any:
         outputs = self(**batch)
-        return {"data": batch, "outputs": outputs}
+        return {"data": batch, "outputs": dict(outputs.items())}

     def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         batch, batch_idx = args
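Returning dict(outputs.items()) converts the transformers ModelOutput into a plain dict before it is passed downstream, which plays better with generic batch-collation code (and with exported models whose outputs are no longer ModelOutput instances). A small illustrative sketch, not code from the repository:

```python
import torch
from transformers.modeling_outputs import QuestionAnsweringModelOutput

# ModelOutput behaves like an ordered mapping; items() yields only the populated fields.
outputs = QuestionAnsweringModelOutput(
    start_logits=torch.zeros(2, 16),
    end_logits=torch.zeros(2, 16),
)
plain = dict(outputs.items())
print(list(plain))  # ['start_logits', 'end_logits']
```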
6 changes: 6 additions & 0 deletions embeddings/pipeline/lightning_question_answering.py
@@ -23,6 +23,9 @@ def __init__(
         self,
         embedding_name_or_path: T_path,
         dataset_name_or_path: T_path,
+        context_column_name: str,
+        question_column_name: str,
+        answer_column_name: str,
         output_path: T_path,
         evaluation_filename: str = "evaluation.json",
         config: LightningQAConfig = LightningQABasicConfig(),
@@ -44,6 +47,9 @@ def __init__(

         datamodule = QuestionAnsweringDataModule(
             dataset_name_or_path=dataset_name_or_path,
+            context_field=context_column_name,
+            question_field=question_column_name,
+            target_field=answer_column_name,
             tokenizer_name_or_path=tokenizer_name_or_path,
             train_batch_size=config.task_model_kwargs["train_batch_size"],
             eval_batch_size=config.task_model_kwargs["eval_batch_size"],
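With the new column-name arguments the QA pipeline no longer assumes fixed dataset column names; they are mapped onto the datamodule's context/question/target fields. A hedged example of wiring it up (the class name is inferred from the module path, and the model, dataset, and output paths are placeholders, not values from the diff):

```python
from embeddings.pipeline.lightning_question_answering import LightningQuestionAnsweringPipeline

pipeline = LightningQuestionAnsweringPipeline(
    embedding_name_or_path="some-org/some-encoder",   # placeholder HF model id
    dataset_name_or_path="some-org/some-qa-dataset",  # placeholder HF dataset id
    context_column_name="context",
    question_column_name="question",
    answer_column_name="answers",
    output_path="outputs/qa",
)
result = pipeline.run()  # assumed entry point, as in the other lightning pipelines
print(result)
```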
6 changes: 6 additions & 0 deletions embeddings/task/lightning_task/__init__.py
@@ -0,0 +1,6 @@
+from typing import Union
+
+from .lightning_task import ClassificationLightningTask
+from .question_answering import QuestionAnsweringTask
+
+SUPPORTED_HF_TASKS = Union[ClassificationLightningTask, QuestionAnsweringTask]
7 changes: 7 additions & 0 deletions embeddings/task/lightning_task/hf_task.py
@@ -0,0 +1,7 @@
+from enum import Enum
+
+
+class HuggingFaceTaskName(Enum):
+    text_classification = "sequence-classification"
+    sequence_labeling = "token-classification"
+    question_answering = "question-answering"
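The enum values match the task identifier strings used by Hugging Face tooling (e.g. "question-answering"), so an exporter can pass them straight through. A trivial sketch (the resolve_hf_task helper is illustrative, not part of the repository):

```python
from embeddings.task.lightning_task.hf_task import HuggingFaceTaskName


def resolve_hf_task(task: HuggingFaceTaskName) -> str:
    # Illustrative helper: the enum value is already the Hugging Face task string.
    return task.value


assert resolve_hf_task(HuggingFaceTaskName.question_answering) == "question-answering"
```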
12 changes: 10 additions & 2 deletions embeddings/task/lightning_task/lightning_task.py
@@ -6,7 +6,7 @@
 from pytorch_lightning.callbacks import Callback, ModelCheckpoint
 from pytorch_lightning.callbacks.early_stopping import EarlyStopping
 from torch.utils.data import DataLoader
-from transformers import AutoModel
+from transformers import AutoModel, AutoTokenizer

 from embeddings.data.datamodule import HuggingFaceDataModule
 from embeddings.data.dataset import LightingDataModuleSubset
@@ -15,6 +15,7 @@
 from embeddings.evaluator.evaluation_results import Predictions
 from embeddings.model.lightning_module.huggingface_module import HuggingFaceLightningModule
 from embeddings.model.lightning_module.lightning_module import LightningModule
+from embeddings.task.lightning_task.hf_task import HuggingFaceTaskName
 from embeddings.task.task import Output, Task
 from embeddings.utils.lightning_callbacks.best_epoch_callback import BestEpochCallback
 from embeddings.utils.loggers import LightningLoggingConfig, get_logger
@@ -36,15 +37,19 @@ def __init__(
         early_stopping_kwargs: Dict[str, Any],
         model_checkpoint_kwargs: Dict[str, Any],
         logging_config: LightningLoggingConfig,
+        hf_task_name: HuggingFaceTaskName,
     ):
         super().__init__()
-        self.output_path = Path(output_path)
+
+        self.output_path: Path = Path(output_path)
+        self.hf_task_name = hf_task_name
         self.task_train_kwargs = task_train_kwargs
         self.early_stopping_kwargs = early_stopping_kwargs
         self.model_checkpoint_kwargs = model_checkpoint_kwargs
         self.model: Optional[HuggingFaceLightningModule] = None
         self.trainer: Optional[pl.Trainer] = None
         self.logging_config = logging_config
+        self.tokenizer: Optional[AutoTokenizer] = None

     @property
     def best_epoch(self) -> Optional[float]:
@@ -85,6 +90,7 @@ def fit(
     ) -> None:
         if not self.model:
             raise self.MODEL_UNDEFINED_EXCEPTION
+        self.tokenizer = data.tokenizer

         callbacks = self._get_callbacks(dataset_subsets=list(data.load_dataset().keys()))
         self.trainer = pl.Trainer(
@@ -149,13 +155,15 @@ def __init__(
         early_stopping_kwargs: Dict[str, Any],
         model_checkpoint_kwargs: Dict[str, Any],
         logging_config: LightningLoggingConfig,
+        hf_task_name: HuggingFaceTaskName,
     ):
         super().__init__(
             output_path=output_path,
             task_train_kwargs=task_train_kwargs,
             early_stopping_kwargs=early_stopping_kwargs,
             model_checkpoint_kwargs=model_checkpoint_kwargs,
             logging_config=logging_config,
+            hf_task_name=hf_task_name,
         )

     def fit_predict(
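Capturing the tokenizer during fit() and tagging each task with its Hugging Face task name gives an exporter everything it needs to write out a standard Hugging Face model directory. A heavily hedged sketch of that idea; the helper name, the task.model.model attribute, and the call site are assumptions, not the repository's actual exporter API:

```python
from pathlib import Path


def export_hf_model(task, output_dir: Path) -> None:
    # Assumption: task.model wraps a transformers model and task.tokenizer
    # was captured from the datamodule during fit().
    output_dir.mkdir(parents=True, exist_ok=True)
    task.model.model.save_pretrained(output_dir)  # underlying transformers model (assumed attribute)
    task.tokenizer.save_pretrained(output_dir)    # tokenizer captured in fit()
    print(f"Exported {task.hf_task_name.value} model to {output_dir}")
```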
2 changes: 2 additions & 0 deletions embeddings/task/lightning_task/question_answering.py
@@ -12,6 +12,7 @@
 from embeddings.data.qa_datamodule import QuestionAnsweringDataModule
 from embeddings.model.lightning_module.lightning_module import LightningModule
 from embeddings.model.lightning_module.question_answering import QuestionAnsweringModule
+from embeddings.task.lightning_task.hf_task import HuggingFaceTaskName
 from embeddings.task.lightning_task.lightning_task import LightningTask
 from embeddings.utils.loggers import LightningLoggingConfig

@@ -40,6 +41,7 @@ def __init__(
             early_stopping_kwargs=early_stopping_kwargs,
             model_checkpoint_kwargs=model_checkpoint_kwargs,
             logging_config=LightningLoggingConfig.from_flags(),
+            hf_task_name=HuggingFaceTaskName.question_answering,
         )
         self.model_name_or_path = model_name_or_path
         self.model_config_kwargs = model_config_kwargs
2 changes: 2 additions & 0 deletions embeddings/task/lightning_task/sequence_labeling.py
@@ -9,6 +9,7 @@
 from embeddings.evaluator.evaluation_results import Predictions
 from embeddings.metric.sequence_labeling import EvaluationMode, TaggingScheme
 from embeddings.model.lightning_module.sequence_labeling import SequenceLabelingModule
+from embeddings.task.lightning_task.hf_task import HuggingFaceTaskName
 from embeddings.task.lightning_task.lightning_task import ClassificationLightningTask
 from embeddings.utils.loggers import LightningLoggingConfig

@@ -35,6 +36,7 @@ def __init__(
             early_stopping_kwargs,
             model_checkpoint_kwargs,
             logging_config,
+            hf_task_name=HuggingFaceTaskName.sequence_labeling,
         )
         self.model_name_or_path = model_name_or_path
         self.num_classes = num_classes
2 changes: 2 additions & 0 deletions embeddings/task/lightning_task/text_classification.py
@@ -6,6 +6,7 @@
 from embeddings.data.io import T_path
 from embeddings.evaluator.evaluation_results import Predictions
 from embeddings.model.lightning_module.text_classification import TextClassificationModule
+from embeddings.task.lightning_task.hf_task import HuggingFaceTaskName
 from embeddings.task.lightning_task.lightning_task import ClassificationLightningTask
 from embeddings.utils.loggers import LightningLoggingConfig

@@ -30,6 +31,7 @@ def __init__(
             early_stopping_kwargs,
             model_checkpoint_kwargs,
             logging_config,
+            hf_task_name=HuggingFaceTaskName.text_classification,
         )
         self.model_name_or_path = model_name_or_path
         self.num_classes = num_classes
@@ -89,7 +89,7 @@ class SampleSplitsHuggingFaceTransformation(SampleSplitsTransformation):
     def _train_test_split(
         self, data: datasets.Dataset, test_fraction: float
     ) -> datasets.DatasetDict:
-        return data.train_test_split(test_size=test_fraction, seed=self.seed)  # type: ignore
+        return data.train_test_split(test_size=test_fraction, seed=self.seed)


 class SampleSplitsStratifiedTransformation(SampleSplitsTransformation):
File renamed without changes.
@@ -22,7 +22,8 @@ def unwrap_outputs_from_batches(
         for tensor_key, tensor in batch[key].items():
             if tensor_key == "loss":
                 continue
-            assert isinstance(tensor, torch.Tensor)
+            if isinstance(tensor, np.ndarray):
+                tensor = torch.from_numpy(tensor)
             if tensor.dtype in {torch.bfloat16, torch.float16}:
                 tensor = tensor.to(dtype=torch.float32)
             tensors_lists_dict[key][tensor_key].append(tensor)
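The unwrapping helper now accepts NumPy arrays as well as tensors, likely because ONNX-exported models return NumPy outputs rather than torch tensors. A self-contained sketch of the conversion step (the helper name is simplified from the diff):

```python
import numpy as np
import torch


def to_float32_tensor(value):
    # ONNX Runtime returns NumPy arrays, while torch modules return tensors.
    tensor = torch.from_numpy(value) if isinstance(value, np.ndarray) else value
    # Half-precision outputs are upcast so downstream metric code sees float32.
    if tensor.dtype in {torch.bfloat16, torch.float16}:
        tensor = tensor.to(dtype=torch.float32)
    return tensor


print(to_float32_tensor(np.ones((2, 3), dtype=np.float16)).dtype)  # torch.float32
```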
3 changes: 2 additions & 1 deletion embeddings/utils/hf_persister.py
@@ -2,11 +2,12 @@

 import datasets

+from embeddings.data.io import T_path
 from embeddings.utils.results_persister import ResultsPersister


 class HuggingFaceDatasetLocalPersister(ResultsPersister[datasets.DatasetDict]):
-    def __init__(self, path: str):
+    def __init__(self, path: T_path) -> None:
         self.path = path

     def persist(self, result: datasets.DatasetDict, **kwargs: Any) -> None: