Skip to content

Commit

Permalink
Phi update 202311 (axolotl-ai-cloud#876)
Browse files Browse the repository at this point in the history
* add phi modeling from hf

* update for packing and use new modeling class for phi

* update e2e tests for phi to use new model name

* update example phi to also use new phi model name

* use AutoModelForCausalLM for phi lora since sample packing isn't supported
  • Loading branch information
winglian committed Nov 17, 2023
1 parent 160fb9e commit c8607d9
Show file tree
Hide file tree
Showing 6 changed files with 1,136 additions and 6 deletions.
2 changes: 1 addition & 1 deletion examples/phi/phi-ft.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
base_model: microsoft/phi-1_5
model_type: MixFormerSequentialForCausalLM
model_type: PhiForCausalLM
tokenizer_type: AutoTokenizer
is_llama_derived_model: false
trust_remote_code: true
Expand Down
2 changes: 2 additions & 0 deletions src/axolotl/models/phi/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@
"""

from .configuration_mixformer_sequential import MixFormerSequentialConfig # noqa
from .configuration_phi import PhiConfig # noqa
from .modeling_mixformer_sequential import MixFormerSequentialForCausalLM # noqa
from .modeling_phi import PhiForCausalLM # noqa
65 changes: 65 additions & 0 deletions src/axolotl/models/phi/configuration_phi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# pylint: skip-file
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import math
from typing import Optional

from transformers import PretrainedConfig


class PhiConfig(PretrainedConfig):
"""Phi configuration."""

model_type = "phi"
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}

def __init__(
self,
vocab_size: int = 50304,
n_positions: int = 2048,
n_embd: int = 1024,
n_layer: int = 20,
n_inner: Optional[int] = None,
n_head: int = 16,
n_head_kv: Optional[int] = None,
rotary_dim: Optional[int] = 32,
activation_function: Optional[str] = "gelu_new",
flash_attn: bool = False,
flash_rotary: bool = False,
fused_dense: bool = False,
attn_pdrop: float = 0.0,
embd_pdrop: float = 0.0,
resid_pdrop: float = 0.0,
layer_norm_epsilon: float = 1e-5,
initializer_range: float = 0.02,
tie_word_embeddings: bool = False,
pad_vocab_size_multiple: int = 64,
**kwargs
) -> None:
self.vocab_size = int(
math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_inner = n_inner
self.n_head = n_head
self.n_head_kv = n_head_kv
self.rotary_dim = min(rotary_dim, n_embd // n_head)
self.activation_function = activation_function
self.flash_attn = flash_attn
self.flash_rotary = flash_rotary
self.fused_dense = fused_dense
self.attn_pdrop = attn_pdrop
self.embd_pdrop = embd_pdrop
self.resid_pdrop = resid_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range

super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
Loading

0 comments on commit c8607d9

Please sign in to comment.