remove ParameterizedEmbedding and ParameterizedLinear
Signed-off-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
fabianlim authored and n1hility committed Jun 21, 2024
1 parent a478a42 commit 0b7367b
Showing 9 changed files with 32 additions and 124 deletions.
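
In short, the commit drops the custom ParameterizedEmbedding and ParameterizedLinear wrappers, whose only extra feature was a std argument used to re-initialize weights from a normal distribution (optionally rescaled for the muP init method), and switches the call sites to plain torch.nn.Embedding and torch.nn.Linear, so weight initialization falls back to the PyTorch defaults. A minimal before/after sketch of the call-site change; the concrete values below are placeholders, not taken from the code:

# Sketch only: hidden_size, add_bias and std stand in for values the real
# modules compute from their config.
import torch.nn as nn

hidden_size, add_bias, std = 768, True, 0.02

# before: the wrapper re-initialized its weight with N(0, std) in reset_parameters()
# c_proj = ParameterizedLinear(hidden_size, hidden_size, bias=add_bias, std=std)

# after: stock module, default PyTorch reset (Kaiming-uniform weight, uniform bias)
c_proj = nn.Linear(hidden_size, hidden_size, bias=add_bias)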
8 changes: 5 additions & 3 deletions src/instructlab/dolomite/hf_models/config.py
@@ -1,6 +1,6 @@
from transformers import PretrainedConfig

-from .enums import AttentionHeadType, InitMethod, PositionEmbeddingType
+from .enums import AttentionHeadType, PositionEmbeddingType


class CommonConfig(PretrainedConfig):
@@ -12,6 +12,9 @@ class CommonConfig(PretrainedConfig):
"num_hidden_layers": "n_layer",
}

+# NOTE: initializer range is kept for backward compatibility
+# but it is not used anymore

def __init__(
self,
vocab_size: int = 50257,
@@ -28,7 +31,7 @@ def __init__(
attn_pdrop: float = 0.1,
normalization_function: str = "layernorm",
layer_norm_epsilon: float = 1e-5,
-initializer_range: float = 0.02,
+initializer_range: float = 0.02,
scale_attn_weights: bool = True,
attention_multiplier: float = None,
use_cache: bool = True,
@@ -82,7 +85,6 @@ def __init__(
assert self.scale_attn_weights

# check if enums are valid
-init_method = InitMethod(init_method)
attention_head_type = AttentionHeadType(attention_head_type)
position_embedding_type = PositionEmbeddingType(position_embedding_type)

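Because the InitMethod check is gone (the enum itself is deleted in the next file), init_method is no longer validated here, and initializer_range is kept purely for backward compatibility. A small illustrative snippet, assuming the package layout shown in the file paths above; the keyword values are examples only:

# Example only (hypothetical usage): initializer_range is still a valid kwarg
# but no longer drives weight initialization anywhere in the model code.
from instructlab.dolomite.hf_models.config import CommonConfig

config = CommonConfig(vocab_size=50257, initializer_range=0.02)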
5 changes: 0 additions & 5 deletions src/instructlab/dolomite/hf_models/enums.py
@@ -1,11 +1,6 @@
from enum import Enum


-class InitMethod(Enum):
-normal = "normal"
-mup = "mup"


class PositionEmbeddingType(Enum):
"""
Enum class for position embeddings
4 changes: 2 additions & 2 deletions src/instructlab/dolomite/hf_models/modeling_utils/__init__.py
@@ -10,7 +10,7 @@
repeat_key_value,
split_query_key_value_tensor_for_attention,
)
-from .embedding import ParameterizedEmbedding
-from .linear import ParameterizedLinear
+from .embedding import Embedding
+from .linear import Linear
from .normalization import RMSNorm, get_normalization_function
from .position_embedding import Alibi, RoPE, YaRNScaledRoPE, apply_rotary_pos_emb
@@ -7,8 +7,8 @@
from transformers import DynamicCache

from ...config import CommonConfig
-from ...enums import AttentionHeadType, InitMethod, PositionEmbeddingType
-from ..linear import ParameterizedLinear
+from ...enums import AttentionHeadType, PositionEmbeddingType
+from ..linear import Linear
from ..position_embedding import apply_rotary_pos_emb
from .utils import repeat_key_value

@@ -23,11 +23,6 @@ def __init__(self, config: CommonConfig, causal: bool, layer_idx: int = None) ->
self.num_key_value_heads = config.num_key_value_heads
self.add_bias = config.add_bias

-initializer_range = config.initializer_range
-m_width = config.m_width
-n_layer = config.n_layer
-init_method = InitMethod(config.init_method)

assert (
self.hidden_size % self.num_heads == 0
), f"`hidden_size` ({self.hidden_size}) must be divisible by `num_heads` ({self.num_heads})"
@@ -71,20 +66,13 @@ def __init__(self, config: CommonConfig, causal: bool, layer_idx: int = None) ->

# note that the actual layout is different for the output and depends on whether we are using MHA, MQA or GQA
# (self.hidden_size + 2 * self.num_key_value_heads * self.head_dim) is just the actual number output features
-std = initializer_range
-if init_method == InitMethod.mup:
-std /= math.sqrt(m_width)
-self.c_attn = ParameterizedLinear(
+self.c_attn = Linear(
self.hidden_size,
self.hidden_size + 2 * self.num_key_value_heads * self.head_dim,
bias=self.add_bias,
-std=std,
)

-std = initializer_range / math.sqrt(2 * n_layer)
-if init_method == InitMethod.mup:
-std /= math.sqrt(m_width)
-self.c_proj = ParameterizedLinear(self.hidden_size, self.hidden_size, bias=self.add_bias, std=std)
+self.c_proj = Linear(self.hidden_size, self.hidden_size, bias=self.add_bias)

self.attn_pdrop = config.attn_pdrop
self.resid_pdrop = config.resid_pdrop
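For reference, the deleted lines above derived a per-projection standard deviation from the config before constructing c_attn and c_proj; with plain Linear the stock PyTorch reset runs instead. A small sketch of the removed formula for c_proj, with argument names following the config fields used in the removed code:

# Sketch of the std the removed code computed for c_proj; not part of the commit.
import math

def removed_c_proj_std(initializer_range: float, n_layer: int,
                       m_width: float = 1.0, mup: bool = False) -> float:
    std = initializer_range / math.sqrt(2 * n_layer)
    if mup:  # InitMethod.mup additionally divided by sqrt(m_width)
        std /= math.sqrt(m_width)
    return std

# example values: removed_c_proj_std(0.02, 12) is roughly 0.0041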
45 changes: 2 additions & 43 deletions src/instructlab/dolomite/hf_models/modeling_utils/embedding.py
@@ -1,44 +1,3 @@
-import torch
-import torch.nn as nn
+from torch.nn import Embedding


-class ParameterizedEmbedding(nn.Embedding):
-def __init__(
-self,
-num_embeddings: int,
-embedding_dim: int,
-padding_idx: int | None = None,
-max_norm: float | None = None,
-norm_type: float = 2,
-scale_grad_by_freq: bool = False,
-sparse: bool = False,
-_weight: torch.Tensor | None = None,
-_freeze: bool = False,
-device=None,
-dtype=None,
-std=None,
-) -> None:
-self.std = std
-super().__init__(
-num_embeddings,
-embedding_dim,
-padding_idx,
-max_norm,
-norm_type,
-scale_grad_by_freq,
-sparse,
-_weight,
-_freeze,
-device,
-dtype,
-)

-@torch.no_grad()
-def reset_parameters(self) -> None:
-if self.std is None:
-super().reset_parameters()
-else:
-# nn.init.trunc_normal_(self.weight, mean=0, std=self.std)
-self.weight.data.normal_(mean=0, std=self.std)
-if self.padding_idx is not None:
-self.weight.data[self.padding_idx].zero_()
+# NOTE: we have replaced ParameterizedEmbedding with torch.nn.Embedding
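The deleted class only differed from torch.nn.Embedding in reset_parameters: when an std was given it redrew the weights from N(0, std) and re-zeroed the padding row. If that behaviour were ever needed again it could be reproduced on a stock embedding roughly as below (hypothetical helper, not part of the codebase):

# Hypothetical helper: mimic the removed ParameterizedEmbedding reset on a
# plain nn.Embedding. With std=None the default nn.Embedding init applied anyway.
import torch
import torch.nn as nn

def make_embedding(num_embeddings: int, embedding_dim: int,
                   std: float | None = None,
                   padding_idx: int | None = None) -> nn.Embedding:
    emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    if std is not None:
        with torch.no_grad():
            emb.weight.normal_(mean=0.0, std=std)
            if padding_idx is not None:
                emb.weight[padding_idx].zero_()
    return emb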
26 changes: 2 additions & 24 deletions src/instructlab/dolomite/hf_models/modeling_utils/linear.py
@@ -1,25 +1,3 @@
-import torch
-import torch.nn as nn
+from torch.nn import Linear


-class ParameterizedLinear(nn.Linear):
-def __init__(
-self,
-in_features: int,
-out_features: int,
-bias: bool = True,
-device=None,
-dtype=None,
-std=None,
-) -> None:
-self.std = std
-super().__init__(in_features, out_features, bias, device, dtype)

-@torch.no_grad()
-def reset_parameters(self) -> None:
-if self.std is None:
-super().reset_parameters()
-else:
-nn.init.normal_(self.weight, mean=0, std=self.std)
-if self.bias is not None:
-self.bias.zero_()
+# NOTE: we have replaced ParameterizedLinear with torch.nn.Linear
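Likewise, ParameterizedLinear only overrode reset_parameters (normal weights with the given std, zero bias); plain nn.Linear keeps its default reset. A hedged sketch of re-applying the old scheme after construction, should a caller want it (the helper name is ours):

# Hypothetical helper: re-apply the removed ParameterizedLinear init scheme
# (N(0, std) weights, zero bias) to a stock nn.Linear.
import torch
import torch.nn as nn

@torch.no_grad()
def reset_linear_normal(linear: nn.Linear, std: float) -> nn.Linear:
    nn.init.normal_(linear.weight, mean=0.0, std=std)
    if linear.bias is not None:
        linear.bias.zero_()
    return linear

# usage sketch: lm_head = reset_linear_normal(nn.Linear(768, 50257, bias=False), std=0.02)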
15 changes: 7 additions & 8 deletions src/instructlab/dolomite/hf_models/models/gpt_dolomite/base.py
@@ -10,8 +10,8 @@
from ...enums import AttentionHeadType, PositionEmbeddingType
from ...modeling_utils import (
Alibi,
-ParameterizedEmbedding,
-ParameterizedLinear,
+Embedding,
+Linear,
RMSNorm,
RoPE,
YaRNScaledRoPE,
@@ -73,7 +73,7 @@ def __init__(self, config: GPTDolomiteConfig, *inputs, **kwargs):
self.upcast_logits_for_loss = config.upcast_logits_for_loss

def _init_weights(self, module: nn.Module) -> None:
-if isinstance(module, (ParameterizedEmbedding, ParameterizedLinear, nn.LayerNorm, RMSNorm, Alibi, RoPE)):
+if isinstance(module, (Embedding, Linear, nn.LayerNorm, RMSNorm, Alibi, RoPE)):
module.reset_parameters()

def get_autoregressive_language_modeling_loss(
@@ -185,15 +185,14 @@ def __init__(self, config: GPTDolomiteConfig, **kwargs) -> None:
self.num_heads = config.num_attention_heads
self.num_key_value_heads = config.num_key_value_heads
self.m_emb = config.m_emb
-self.initializer_range = config.initializer_range

assert (
self.embed_dim % self.num_heads == 0
), f"`embed_dim` ({self.embed_dim}) must be divisible by `num_heads` ({self.num_heads})"

self.head_dim = self.embed_dim // self.num_heads

-self.wte = ParameterizedEmbedding(config.vocab_size, self.embed_dim, std=self.initializer_range)
+self.wte = Embedding(config.vocab_size, self.embed_dim)

self.drop = nn.Identity() if config.embd_pdrop == 0 else nn.Dropout(config.embd_pdrop)
self.h = nn.ModuleList(
@@ -221,10 +220,10 @@ def __init__(self, config: GPTDolomiteConfig, **kwargs) -> None:
# Initialize weights and apply final processing
self.post_init()

-def get_input_embeddings(self) -> ParameterizedEmbedding:
+def get_input_embeddings(self) -> Embedding:
return self.wte

-def set_input_embeddings(self, new_embeddings: ParameterizedEmbedding) -> None:
+def set_input_embeddings(self, new_embeddings: Embedding) -> None:
self.wte = new_embeddings

def forward(
@@ -605,7 +604,7 @@ def _setup_positional_encoding(self) -> None:
max_position_embeddings = self.config.max_position_embeddings

if self.position_embedding_type == PositionEmbeddingType.learned_absolute:
-self.wpe = ParameterizedEmbedding(max_position_embeddings, self.embed_dim, std=self.initializer_range)
+self.wpe = Embedding(max_position_embeddings, self.embed_dim)
elif self.position_embedding_type == PositionEmbeddingType.alibi:
assert not self._use_flash_attention_2, "alibi is not implemented with FlashAttention"

14 changes: 7 additions & 7 deletions src/instructlab/dolomite/hf_models/models/gpt_dolomite/main.py
@@ -5,7 +5,7 @@
from transformers import DynamicCache
from transformers.modeling_outputs import CausalLMOutputWithPast

-from ...modeling_utils import ParameterizedEmbedding, ParameterizedLinear
+from ...modeling_utils import Embedding, Linear
from .base import GPTDolomiteModel, GPTDolomitePreTrainedModel
from .config import GPTDolomiteConfig

@@ -18,26 +18,26 @@ def __init__(self, config: GPTDolomiteConfig, **kwargs) -> None:
self.transformer = GPTDolomiteModel(config, **kwargs)

if not self._tied_word_embeddings:
-self.lm_head = ParameterizedLinear(
-config.n_embd, config.vocab_size, bias=False, std=config.initializer_range
+self.lm_head = Linear(
+config.n_embd, config.vocab_size, bias=False
)

self.m_width = config.m_width

# Initialize weights and apply final processing
self.post_init()

-def get_input_embeddings(self) -> ParameterizedEmbedding:
+def get_input_embeddings(self) -> Embedding:
return self.transformer.wte

-def set_input_embeddings(self, value: ParameterizedEmbedding) -> None:
+def set_input_embeddings(self, value: Embedding) -> None:
self.transformer.wte = value

-def get_output_embeddings(self) -> ParameterizedLinear:
+def get_output_embeddings(self) -> Linear:
if not self._tied_word_embeddings:
return self.lm_head

-def set_output_embeddings(self, new_embeddings: ParameterizedLinear) -> None:
+def set_output_embeddings(self, new_embeddings: Linear) -> None:
if not self._tied_word_embeddings:
self.lm_head = new_embeddings

19 changes: 3 additions & 16 deletions src/instructlab/dolomite/hf_models/models/gpt_dolomite/mlp.py
@@ -4,8 +4,7 @@
import torch
import torch.nn as nn

-from ...enums import InitMethod
-from ...modeling_utils import ParameterizedLinear, get_activation_function, is_glu
+from ...modeling_utils import Linear, get_activation_function, is_glu
from .config import GPTDolomiteConfig


@@ -19,27 +18,15 @@ def __init__(self, config: GPTDolomiteConfig) -> None:
add_bias = config.add_bias
residual_dropout = config.resid_pdrop

-init_method = InitMethod(config.init_method)
-initializer_range = config.initializer_range
-m_width = config.m_width
-n_layer = config.n_layer

-std = initializer_range
-if init_method == InitMethod.mup:
-std /= math.sqrt(m_width)
-self.c_fc = ParameterizedLinear(
+self.c_fc = Linear(
hidden_size,
2 * intermediate_size if is_glu(activation_function) else intermediate_size,
bias=add_bias,
-std=std,
)

self.act = get_activation_function(activation_function)

-std = initializer_range / math.sqrt(2 * n_layer)
-if init_method == InitMethod.mup:
-std /= math.sqrt(m_width)
-self.c_proj = ParameterizedLinear(intermediate_size, hidden_size, bias=add_bias, std=std)
+self.c_proj = Linear(intermediate_size, hidden_size, bias=add_bias)

self.dropout = nn.Identity() if residual_dropout == 0 else nn.Dropout(residual_dropout)

