
Commit

Merge branch 'main' into fix/fix-inference-tutorial
ktagowski authored Apr 17, 2023
2 parents 8393b16 + 8d8ec7e commit b2f1180
Showing 40 changed files with 3,765 additions and 765 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -33,6 +33,9 @@ MANIFEST
*.manifest
*.spec

# macOS
.DS_Store

# Installer logs
pip-log.txt
pip-delete-this-directory.txt
@@ -148,4 +151,4 @@ webpage/public
_proc
_docs

lepiszcze-submissions/
lepiszcze-submissions/
154 changes: 80 additions & 74 deletions embeddings/_modidx.py
@@ -1,76 +1,82 @@
# Autogenerated by nbdev

d = { 'settings': { 'branch': 'main',
'doc_baseurl': '/embeddings',
'doc_host': 'https://CLARIN-PL.github.io',
'git_url': 'https://github.com/CLARIN-PL/embeddings',
'lib_path': 'embeddings'},
'syms': { 'embeddings.config.base_config': {},
'embeddings.config.config_space': {},
'embeddings.config.lighting_config_space': {},
'embeddings.config.lightning_config': {},
'embeddings.config.parameters': {},
'embeddings.data.data_collator': {},
'embeddings.data.data_loader': {},
'embeddings.data.datamodule': {},
'embeddings.data.dataset': {},
'embeddings.data.io': {},
'embeddings.defaults': {},
'embeddings.embedding.embedding': {},
'embeddings.embedding.sklearn_embedding': {},
'embeddings.evaluator.evaluation_results': {},
'embeddings.evaluator.evaluator': {},
'embeddings.evaluator.leaderboard': {},
'embeddings.evaluator.metrics_evaluator': {},
'embeddings.evaluator.sequence_labeling_evaluator': {},
'embeddings.evaluator.submission': {},
'embeddings.evaluator.submission_utils': {},
'embeddings.evaluator.text_classification_evaluator': {},
'embeddings.metric.hugging_face_metric': {},
'embeddings.metric.lightning_seqeval_metric': {},
'embeddings.metric.metric': {},
'embeddings.metric.prfs_per_class_metric': {},
'embeddings.metric.seqeval_metric': {},
'embeddings.metric.sequence_labeling': {},
'embeddings.metric.unit_seqeval_metric': {},
'embeddings.model.base_model': {},
'embeddings.model.lightning_model': {},
'embeddings.model.lightning_module.huggingface_module': {},
'embeddings.model.lightning_module.lightning_module': {},
'embeddings.model.lightning_module.sequence_labeling': {},
'embeddings.model.lightning_module.text_classification': {},
'embeddings.model.model': {},
'embeddings.model.sklearn_model': {},
'embeddings.pipeline.hf_preprocessing_pipeline': {},
'embeddings.pipeline.hps_pipeline': {},
'embeddings.pipeline.lightning_classification': {},
'embeddings.pipeline.lightning_hps_pipeline': {},
'embeddings.pipeline.lightning_pipeline': {},
'embeddings.pipeline.lightning_sequence_labeling': {},
'embeddings.pipeline.pipeline': {},
'embeddings.pipeline.pipeline_builder': {},
'embeddings.pipeline.pipelines_metadata': {},
'embeddings.pipeline.preprocessing_pipeline': {},
'embeddings.pipeline.sklearn_classification': {},
'embeddings.pipeline.standard_pipeline': {},
'embeddings.task.lightning_task.lightning_task': {},
'embeddings.task.lightning_task.sequence_labeling': {},
'embeddings.task.lightning_task.text_classification': {},
'embeddings.task.sklearn_task.sklearn_task': {},
'embeddings.task.sklearn_task.text_classification': {},
'embeddings.task.task': {},
'embeddings.transformation.hf_transformation.downsample_transformation': {},
'embeddings.transformation.hf_transformation.drop_subset_transformation': {},
'embeddings.transformation.hf_transformation.sampling_transformation': {},
'embeddings.transformation.hf_transformation.to_pandas_transformation': {},
'embeddings.transformation.pandas_transformation.rename_input_columns_transformation': {},
'embeddings.transformation.transformation': {},
'embeddings.utils.array_like': {},
'embeddings.utils.hf_persister': {},
'embeddings.utils.hps_persister': {},
'embeddings.utils.json_dict_persister': {},
'embeddings.utils.lightning_callbacks.best_epoch_callback': {},
'embeddings.utils.loggers': {},
'embeddings.utils.results_persister': {},
'embeddings.utils.torch_utils': {},
'embeddings.utils.utils': {}}}
d = {
"settings": {
"branch": "main",
"doc_baseurl": "/embeddings",
"doc_host": "https://CLARIN-PL.github.io",
"git_url": "https://github.com/CLARIN-PL/embeddings",
"lib_path": "embeddings",
},
"syms": {
"embeddings.config.base_config": {},
"embeddings.config.config_space": {},
"embeddings.config.lighting_config_space": {},
"embeddings.config.lightning_config": {},
"embeddings.config.parameters": {},
"embeddings.data.data_collator": {},
"embeddings.data.data_loader": {},
"embeddings.data.datamodule": {},
"embeddings.data.dataset": {},
"embeddings.data.io": {},
"embeddings.defaults": {},
"embeddings.embedding.embedding": {},
"embeddings.embedding.sklearn_embedding": {},
"embeddings.evaluator.evaluation_results": {},
"embeddings.evaluator.evaluator": {},
"embeddings.evaluator.leaderboard": {},
"embeddings.evaluator.metrics_evaluator": {},
"embeddings.evaluator.sequence_labeling_evaluator": {},
"embeddings.evaluator.submission": {},
"embeddings.evaluator.submission_utils": {},
"embeddings.evaluator.text_classification_evaluator": {},
"embeddings.metric.hugging_face_metric": {},
"embeddings.metric.lightning_seqeval_metric": {},
"embeddings.metric.metric": {},
"embeddings.metric.prfs_per_class_metric": {},
"embeddings.metric.seqeval_metric": {},
"embeddings.metric.sequence_labeling": {},
"embeddings.metric.unit_seqeval_metric": {},
"embeddings.model.base_model": {},
"embeddings.model.lightning_model": {},
"embeddings.model.lightning_module.huggingface_module": {},
"embeddings.model.lightning_module.lightning_module": {},
"embeddings.model.lightning_module.sequence_labeling": {},
"embeddings.model.lightning_module.text_classification": {},
"embeddings.model.model": {},
"embeddings.model.sklearn_model": {},
"embeddings.pipeline.hf_preprocessing_pipeline": {},
"embeddings.pipeline.hps_pipeline": {},
"embeddings.pipeline.lightning_classification": {},
"embeddings.pipeline.lightning_hps_pipeline": {},
"embeddings.pipeline.lightning_pipeline": {},
"embeddings.pipeline.lightning_sequence_labeling": {},
"embeddings.pipeline.pipeline": {},
"embeddings.pipeline.pipeline_builder": {},
"embeddings.pipeline.pipelines_metadata": {},
"embeddings.pipeline.preprocessing_pipeline": {},
"embeddings.pipeline.sklearn_classification": {},
"embeddings.pipeline.standard_pipeline": {},
"embeddings.task.lightning_task.lightning_task": {},
"embeddings.task.lightning_task.sequence_labeling": {},
"embeddings.task.lightning_task.text_classification": {},
"embeddings.task.sklearn_task.sklearn_task": {},
"embeddings.task.sklearn_task.text_classification": {},
"embeddings.task.task": {},
"embeddings.transformation.hf_transformation.downsample_transformation": {},
"embeddings.transformation.hf_transformation.drop_subset_transformation": {},
"embeddings.transformation.hf_transformation.sampling_transformation": {},
"embeddings.transformation.hf_transformation.to_pandas_transformation": {},
"embeddings.transformation.pandas_transformation.rename_input_columns_transformation": {},
"embeddings.transformation.transformation": {},
"embeddings.utils.array_like": {},
"embeddings.utils.hf_persister": {},
"embeddings.utils.hps_persister": {},
"embeddings.utils.json_dict_persister": {},
"embeddings.utils.lightning_callbacks.best_epoch_callback": {},
"embeddings.utils.loggers": {},
"embeddings.utils.results_persister": {},
"embeddings.utils.torch_utils": {},
"embeddings.utils.utils": {},
},
}
27 changes: 27 additions & 0 deletions embeddings/config/lightning_config.py
@@ -113,4 +113,31 @@ def from_basic() -> "LightningAdvancedConfig":
)


@dataclass
class LightningQABasicConfig(LightningBasicConfig):
batch_encoding_kwargs: Dict[str, Any] = field(
default_factory=lambda: {
"padding": "max_length",
"truncation": "only_second",
"return_offsets_mapping": True,
"return_overflowing_tokens": True,
}
)
doc_stride: int = 64
precision: Union[str, int] = 32

@property
def task_model_kwargs(self) -> Dict[str, Any]:
task_model_kwargs = super().task_model_kwargs
task_model_kwargs["doc_stride"] = self.doc_stride
return task_model_kwargs

@property
def task_train_kwargs(self) -> Dict[str, Any]:
task_train_kwargs = super().task_train_kwargs
task_train_kwargs["precision"] = self.precision
return task_train_kwargs


LightningConfig = Union[LightningBasicConfig, LightningAdvancedConfig]
LightningQAConfig = Union[LightningQABasicConfig, LightningAdvancedConfig]
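
For orientation, here is a minimal, hedged sketch of how the new LightningQABasicConfig could be exercised; it assumes the class is instantiable with its declared defaults and relies only on the fields and properties added in this diff.

# Sketch (assumption: the defaults declared in the dataclass above are sufficient on their own).
from embeddings.config.lightning_config import LightningQABasicConfig

config = LightningQABasicConfig()

# doc_stride is merged into the task model kwargs by the property above.
assert config.task_model_kwargs["doc_stride"] == 64

# precision is merged into the task train kwargs by the property above.
assert config.task_train_kwargs["precision"] == 32

# QA-specific tokenizer settings come from batch_encoding_kwargs.
print(config.batch_encoding_kwargs["truncation"])  # "only_second"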
34 changes: 30 additions & 4 deletions embeddings/data/datamodule.py
@@ -18,6 +18,7 @@
from datasets import ClassLabel, Dataset, DatasetDict
from datasets import Sequence as HFSequence
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset as TorchDataset
from transformers import AutoTokenizer, BatchEncoding

from embeddings.data import dataset as embeddings_dataset
@@ -37,6 +38,17 @@
_logger = get_logger(__name__)


class TorchFromHuggingFaceDataset(TorchDataset[HuggingFaceDataset]):
def __init__(self, dataset: Dataset):
self.dataset = dataset

def __getitem__(self, index: int) -> Any:
return self.dataset[index]

def __len__(self) -> int:
return len(self.dataset)


class BaseDataModule(abc.ABC, pl.LightningDataModule, Generic[Data]):
dataset: Data

@@ -107,6 +119,20 @@ def _parse_dataset_info(self) -> Tuple[str, str]:

return dataset_info, dataset_version

def predict_dataloader(
self,
) -> Union[DataLoader[HuggingFaceDataset], Sequence[DataLoader[HuggingFaceDataset]]]:
return [
DataLoader(
dataset=TorchFromHuggingFaceDataset(self.dataset[split]),
batch_size=self.eval_batch_size,
collate_fn=self.collate_fn,
shuffle=False,
**self.dataloader_kwargs,
)
for split in self.splits
]

@abc.abstractmethod
def prepare_labels(self) -> None:
pass
@@ -158,7 +184,7 @@ def process_data(self) -> None:

def train_dataloader(self) -> DataLoader[HuggingFaceDataset]:
return DataLoader(
dataset=self.dataset["train"], # type: ignore
dataset=self.dataset["train"],
batch_size=self.train_batch_size,
collate_fn=self.collate_fn,
shuffle=True,
@@ -170,7 +196,7 @@ def train_dataloader(self) -> DataLoader[HuggingFaceDataset]:
def val_dataloader(self) -> Optional[DataLoader[HuggingFaceDataset]]: # type: ignore
if "validation" in self.dataset:
return DataLoader(
dataset=self.dataset["validation"], # type: ignore
dataset=self.dataset["validation"],
batch_size=self.eval_batch_size,
collate_fn=self.collate_fn,
shuffle=False,
@@ -181,7 +207,7 @@ def val_dataloader(self) -> Optional[DataLoader[HuggingFaceDataset]]: # type: ignore

def test_dataloader(self) -> DataLoader[HuggingFaceDataset]:
return DataLoader(
dataset=self.dataset["test"], # type: ignore
dataset=self.dataset["test"],
batch_size=self.eval_batch_size,
collate_fn=self.collate_fn,
shuffle=False,
@@ -193,7 +219,7 @@ def get_subset(
) -> Union[LightingDataLoaders, None]:
if subset == "train":
return self.train_dataloader()
elif subset == "dev":
elif subset in ("dev", "validation"):
return self.val_dataloader()
elif subset == "test":
return self.test_dataloader()
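
As a rough usage sketch for the new TorchFromHuggingFaceDataset wrapper (the tiny hand-built dataset below is purely illustrative), this shows that the class only forwards indexing and length to the wrapped Hugging Face dataset, which is what lets predict_dataloader above hand each split to a torch DataLoader.

# Sketch (assumption: the wrapper is used directly, outside the data module).
from datasets import Dataset
from torch.utils.data import DataLoader

from embeddings.data.datamodule import TorchFromHuggingFaceDataset

hf_dataset = Dataset.from_dict(
    {"question": ["Who?", "When?"], "context": ["Answer A.", "Answer B."]}
)
wrapped = TorchFromHuggingFaceDataset(hf_dataset)

print(len(wrapped))  # 2
print(wrapped[0])    # {'question': 'Who?', 'context': 'Answer A.'}

# Mirrors predict_dataloader, but without the data module's collate_fn.
loader = DataLoader(wrapped, batch_size=2, shuffle=False)
for batch in loader:
    print(batch)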
