Feat/adjust static embedding experiments #270

Draft · wants to merge 5 commits into base: main
embeddings/config/flair_config_space.py (15 changes: 8 additions & 7 deletions)

@@ -18,7 +18,7 @@
 class FlairTextClassificationConfigSpaceMapping:
     LOAD_MODEL_KEYS_MAPPING: ClassVar[Mapping[str, Set[str]]] = MappingProxyType(
         {
-            "FlairDocumentCNNEmbeddings": {
+            "FlairDocumentRNNEmbeddings": {
                 "hidden_size",
                 "rnn_type",
                 "rnn_layers",
@@ -27,8 +27,8 @@ class FlairTextClassificationConfigSpaceMapping:
                 "word_dropout",
                 "reproject_words",
             },
-            "FlairDocumentRNNEmbeddings": {
-                "cnn_pool_kernels",
+            "FlairDocumentCNNEmbeddings": {
+                "kernels",
                 "dropout",
                 "word_dropout",
                 "reproject_words",
@@ -38,7 +38,7 @@ class FlairTextClassificationConfigSpaceMapping:
         }
     )
     LOAD_MODEL_KEYS: ClassVar[Set[str]] = {
-        "cnn_pool_kernels",
+        "kernels",
         "fine_tune_mode",
         "reproject_words",
         "pooling",
@@ -247,12 +247,13 @@ class FlairTextClassificationConfigSpace(
     dynamic_fine_tune: Parameter = SearchableParameter(
         name="fine_tune", type="categorical", choices=[False, True]
     )
-    # Choices to Optuna can only take primitives;
-    # This parameter results in Optuna warning but the library works properly
-    cnn_pool_kernels: Parameter = SearchableParameter(
+    # CNN pooling kernels
+    kernels: Parameter = SearchableParameter(
         name="kernels",
         type="categorical",
         choices=[((100, 3), (100, 4), (100, 5)), ((200, 4), (200, 5), (200, 6))],
+        # Choices to Optuna can only take primitives;
+        # This parameter results in Optuna warning but the library works properly
     )
     hidden_size: Parameter = SearchableParameter(
         name="hidden_size", type="int_uniform", low=128, high=2048, step=128
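Note on the Optuna comment above: a minimal, illustrative sketch (not code from this PR) of what sampling tuple-valued kernels looks like with plain Optuna. The objective and choice values here are placeholders; the point is that non-primitive categorical choices emit a warning but the sampled value is still usable.

import optuna

# Illustrative only: tuple choices trigger Optuna's warning about
# non-primitive categorical values, yet optimization still runs.
def objective(trial: optuna.Trial) -> float:
    kernels = trial.suggest_categorical(
        "kernels",
        [((100, 3), (100, 4), (100, 5)), ((200, 4), (200, 5), (200, 6))],
    )
    # A real objective would build and evaluate a model here; return a dummy score.
    return float(len(kernels))

study = optuna.create_study()
study.optimize(objective, n_trials=2)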
embeddings/embedding/sklearn_embedding.py (25 changes: 17 additions & 8 deletions)

@@ -1,24 +1,33 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, TypeVar, Union

 import pandas as pd
-from sklearn.base import BaseEstimator as AnySklearnVectorizer
+import scipy
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import _VectorizerMixin

 from embeddings.embedding.embedding import Embedding
+from embeddings.embedding.vectorizer.vectorizer import Vectorizer
 from embeddings.utils.array_like import ArrayLike

+SklearnVectorizer = TypeVar(
+    "SklearnVectorizer", bound=Union[Vectorizer, _VectorizerMixin, BaseEstimator]
+)
+

 class SklearnEmbedding(Embedding[ArrayLike, pd.DataFrame]):
     def __init__(
-        self, vectorizer: AnySklearnVectorizer, embedding_kwargs: Optional[Dict[str, Any]] = None
+        self, vectorizer: SklearnVectorizer, vectorizer_kwargs: Optional[Dict[str, Any]] = None
     ):
         super().__init__()
-        self.embedding_kwargs = embedding_kwargs if embedding_kwargs else {}
-        self.vectorizer = vectorizer(**self.embedding_kwargs)
+        assert callable(vectorizer)
+        self.vectorizer_kwargs = vectorizer_kwargs if vectorizer_kwargs else {}
+        self.vectorizer = vectorizer(**self.vectorizer_kwargs)

     def fit(self, data: ArrayLike) -> None:
         self.vectorizer.fit(data)

     def embed(self, data: ArrayLike) -> pd.DataFrame:
-        return pd.DataFrame(
-            self.vectorizer.transform(data).A, columns=self.vectorizer.get_feature_names_out()
-        )
+        embedded = self.vectorizer.transform(data)
+        if scipy.sparse.issparse(embedded):
+            embedded = embedded.A
+        return pd.DataFrame(embedded)
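A small usage sketch of the updated interface (illustrative, not part of the diff): the vectorizer is now passed as a class and instantiated from vectorizer_kwargs, and embed densifies sparse output before building the DataFrame. The sample texts are made up.

from sklearn.feature_extraction.text import TfidfVectorizer

from embeddings.embedding.sklearn_embedding import SklearnEmbedding

texts = ["ala ma kota", "kot ma ale"]  # made-up sample data
embedding = SklearnEmbedding(vectorizer=TfidfVectorizer, vectorizer_kwargs={"lowercase": True})
embedding.fit(texts)
df = embedding.embed(texts)  # TF-IDF output is sparse, so it is converted to dense via .A
print(df.shape)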
Empty file.
embeddings/embedding/vectorizer/flair.py (35 changes: 35 additions & 0 deletions)

@@ -0,0 +1,35 @@
+import abc
+from typing import Any, Dict, Generic, List, Optional
+
+import numpy as np
+from flair.data import Sentence
+from numpy import typing as nptyping
+
+from embeddings.embedding.flair_embedding import FlairEmbedding
+from embeddings.embedding.vectorizer.vectorizer import Output, Vectorizer
+from embeddings.utils.array_like import ArrayLike
+
+
+class FlairVectorizer(Vectorizer[FlairEmbedding, Output], abc.ABC, Generic[Output]):
+    def fit(self, x: ArrayLike, y: Optional[ArrayLike] = None) -> None:
+        pass
+
+    def fit_transform(self, x: ArrayLike, y: Optional[ArrayLike] = None, **kwargs: Any) -> Output:
+        return self.transform(x)
+
+
+class FlairDocumentVectorizer(FlairVectorizer[nptyping.NDArray[np.float_]]):
+    def transform(self, x: ArrayLike) -> nptyping.NDArray[np.float_]:
+        sentences = [Sentence(example) for example in x]
+        embeddings = [sentence.embedding.numpy() for sentence in self.embedding.embed(sentences)]
+        return np.vstack(embeddings)
+
+
+class FlairWordVectorizer(FlairVectorizer[List[List[Dict[int, float]]]]):
+    def transform(self, x: ArrayLike) -> List[List[Dict[int, float]]]:
+        sentences = [Sentence(example) for example in x]
+        embeddings = [sentence for sentence in self.embedding.embed(sentences)]
+        return [
+            [{i: value for i, value in enumerate(word.embedding.numpy())} for word in sent]
+            for sent in embeddings
+        ]
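A rough usage sketch (illustrative only, not from the PR): it substitutes raw flair embeddings for the library's FlairEmbedding wrapper, assuming both expose .embed(sentences) and return the embedded sentences, which flair's own embeddings do.

from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings

from embeddings.embedding.vectorizer.flair import FlairDocumentVectorizer, FlairWordVectorizer

texts = ["Ala ma kota.", "Kot ma Ale."]  # made-up sample data

# Document-level features: one dense vector per text.
doc_vectorizer = FlairDocumentVectorizer(DocumentPoolEmbeddings([WordEmbeddings("glove")]))
doc_matrix = doc_vectorizer.fit_transform(texts)  # ndarray of shape (n_documents, embedding_dim)

# Word-level features: one {dimension_index: value} dict per token.
word_vectorizer = FlairWordVectorizer(WordEmbeddings("glove"))
word_features = word_vectorizer.fit_transform(texts)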
embeddings/embedding/vectorizer/vectorizer.py (34 changes: 34 additions & 0 deletions)

@@ -0,0 +1,34 @@
+import abc
+from typing import Any, Generic, Optional, TypeVar
+
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.feature_extraction.text import _VectorizerMixin
+
+from embeddings.utils.array_like import ArrayLike
+
+Output = TypeVar("Output")
+Embedding = TypeVar("Embedding")
+
+
+# ignoring the mypy error due to no types (Any) in TransformerMixin and BaseEstimator classes
+class Vectorizer(
+    TransformerMixin,  # type: ignore
+    _VectorizerMixin,  # type: ignore
+    BaseEstimator,  # type: ignore
+    abc.ABC,
+    Generic[Embedding, Output],
+):
+    def __init__(self, embedding: Embedding) -> None:
+        self.embedding = embedding
+
+    @abc.abstractmethod
+    def fit(self, x: ArrayLike, y: Optional[ArrayLike] = None) -> None:
+        pass
+
+    @abc.abstractmethod
+    def transform(self, x: ArrayLike) -> Output:
+        pass
+
+    @abc.abstractmethod
+    def fit_transform(self, x: ArrayLike, y: Optional[ArrayLike] = None, **kwargs: Any) -> Output:
+        pass
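To illustrate the contract (a toy example, not code from this PR): a concrete vectorizer only has to implement fit, transform and fit_transform, and can wrap an arbitrary embedding object (here simply None).

from typing import Any, List, Optional

from embeddings.embedding.vectorizer.vectorizer import Vectorizer
from embeddings.utils.array_like import ArrayLike


class LengthVectorizer(Vectorizer[None, List[List[float]]]):
    # Toy vectorizer mapping each text to [character_count, token_count].
    def fit(self, x: ArrayLike, y: Optional[ArrayLike] = None) -> None:
        pass  # nothing to learn

    def transform(self, x: ArrayLike) -> List[List[float]]:
        return [[float(len(text)), float(len(text.split()))] for text in x]

    def fit_transform(self, x: ArrayLike, y: Optional[ArrayLike] = None, **kwargs: Any) -> List[List[float]]:
        return self.transform(x)


vectorizer = LengthVectorizer(embedding=None)
print(vectorizer.fit_transform(["ala ma kota", "kot"]))  # [[11.0, 3.0], [3.0, 1.0]]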
embeddings/pipeline/hps_pipeline.py (6 changes: 3 additions & 3 deletions)

@@ -22,7 +22,7 @@
 from embeddings.pipeline.preprocessing_pipeline import PreprocessingPipeline
 from embeddings.pipeline.standard_pipeline import LoaderResult, ModelResult, TransformationResult
 from embeddings.utils.hps_persister import HPSResultsPersister
-from embeddings.utils.loggers import LightningLoggingConfig
+from embeddings.utils.loggers import LightningLoggingConfig, LoggingConfig
 from embeddings.utils.utils import standardize_name

 EvaluationResult = TypeVar("EvaluationResult", bound=EvaluationResults)
@@ -57,7 +57,7 @@ def persisting(
         self,
         best_params_path: T_path,
         log_path: T_path,
-        logging_config: LightningLoggingConfig = LightningLoggingConfig(),
+        logging_config: Optional[LoggingConfig] = LightningLoggingConfig(),
         logging_hps_summary_name: Optional[str] = None,
     ) -> "PersistingPipeline[Metadata]":
         return PersistingPipeline(
@@ -71,7 +71,7 @@ def __init__(
         base_pipeline: OptimizedPipeline[Metadata],
         best_params_path: T_path,
         log_path: T_path,
-        logging_config: LightningLoggingConfig = LightningLoggingConfig(),
+        logging_config: Optional[LoggingConfig] = LightningLoggingConfig(),
         logging_hps_summary_name: Optional[str] = None,
     ):
         self.base_pipeline = base_pipeline
embeddings/pipeline/sklearn_classification.py (4 changes: 2 additions & 2 deletions)

@@ -50,7 +50,7 @@ def __init__(
         evaluation_filename: str = "evaluation.json",
         predict_subset: Literal["dev", "validation", "test"] = "test",
         classifier_kwargs: Optional[Dict[str, Any]] = None,
-        embedding_kwargs: Optional[Dict[str, Any]] = None,
+        vectorizer_kwargs: Optional[Dict[str, Any]] = None,
         load_dataset_kwargs: Optional[Dict[str, Any]] = None,
     ):
         dataset = Dataset(
@@ -63,7 +63,7 @@
             RenameInputColumnsTransformation(input_column_name, target_column_name)
         )
         classifier_kwargs = classifier_kwargs if classifier_kwargs else {}
-        embedding = SklearnEmbedding(embedding_kwargs=embedding_kwargs, vectorizer=vectorizer)
+        embedding = SklearnEmbedding(vectorizer_kwargs=vectorizer_kwargs, vectorizer=vectorizer)
         task = TextClassification(classifier=classifier, classifier_kwargs=classifier_kwargs)
         model = SklearnModel(embedding, task, predict_subset=predict_subset)
         output_path = Path(output_path)
embeddings/task/sklearn_task/text_classification.py (9 changes: 6 additions & 3 deletions)

@@ -1,21 +1,24 @@
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, TypeVar

 import pandas as pd
-from sklearn.base import ClassifierMixin as AnySklearnClassifier
+from sklearn.base import ClassifierMixin

 from embeddings.embedding.sklearn_embedding import SklearnEmbedding
 from embeddings.evaluator.evaluation_results import Predictions
 from embeddings.task.sklearn_task.sklearn_task import SklearnTask
 from embeddings.utils.array_like import ArrayLike

+SklearnClassifier = TypeVar("SklearnClassifier", bound=ClassifierMixin)
+

 class TextClassification(SklearnTask):
     def __init__(
         self,
-        classifier: AnySklearnClassifier,
+        classifier: SklearnClassifier,
         classifier_kwargs: Optional[Dict[str, Any]] = None,
     ):
         super().__init__()
+        assert callable(classifier)
         self.classifier_kwargs = classifier_kwargs if classifier_kwargs else {}
         self.classifier = classifier(**self.classifier_kwargs)
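Like the vectorizer, the classifier is now expected as a class rather than an instance and is instantiated from classifier_kwargs; a minimal illustration (not from the PR):

from sklearn.linear_model import LogisticRegression

from embeddings.task.sklearn_task.text_classification import TextClassification

# The class (not an instance) is passed in; kwargs are applied on instantiation.
task = TextClassification(classifier=LogisticRegression, classifier_kwargs={"max_iter": 1000})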
embeddings/utils/hps_persister.py (9 changes: 6 additions & 3 deletions)

@@ -7,7 +7,7 @@

 from embeddings.data.io import T_path
 from embeddings.pipeline.pipelines_metadata import Metadata
-from embeddings.utils.loggers import LightningLoggingConfig, WandbWrapper
+from embeddings.utils.loggers import LightningLoggingConfig, LoggingConfig, WandbWrapper
 from embeddings.utils.results_persister import ResultsPersister
 from embeddings.utils.utils import standardize_name

@@ -16,15 +16,18 @@
 class HPSResultsPersister(ResultsPersister[Tuple[pd.DataFrame, Metadata]], Generic[Metadata]):
     best_params_path: T_path
     log_path: T_path
-    logging_config: LightningLoggingConfig = LightningLoggingConfig()
+    logging_config: Optional[LoggingConfig] = LightningLoggingConfig()
     logging_hps_summary_name: Optional[str] = None

     def persist(self, result: Tuple[pd.DataFrame, Metadata], **kwargs: Any) -> None:
         log, metadata = result
         log.to_pickle(self.log_path)
         with open(self.best_params_path, "w") as f:
             yaml.dump(data=metadata, stream=f)
-        if self.logging_config.use_wandb():
+        if (
+            isinstance(self.logging_config, LightningLoggingConfig)
+            and self.logging_config.use_wandb()
+        ):
             general_metadata = deepcopy(metadata)
             del general_metadata["config"]
             logger = WandbWrapper()
embeddings/utils/loggers.py (7 changes: 6 additions & 1 deletion)

@@ -30,7 +30,12 @@

 @dataclass
-class LightningLoggingConfig:
+class LoggingConfig(abc.ABC):
+    pass
+
+
+@dataclass
+class LightningLoggingConfig(LoggingConfig):
     loggers_names: List[Literal["wandb", "csv", "tensorboard"]] = field(default_factory=list)
     tracking_project_name: Optional[str] = None
     wandb_entity: Optional[str] = None
pyproject.toml (3 changes: 2 additions & 1 deletion)

@@ -119,7 +119,8 @@ module = [
     "spacy",
     "appdirs",
     "dataset.arrow_dataset",
-    "seqeval.*"
+    "seqeval.*",
+    "scipy"
 ]
 ignore_missing_imports = true