From 845b9f7b063114b012c6b5a9d11dbbbc43f73b59 Mon Sep 17 00:00:00 2001
From: meghagarwal <16129366+megha95@users.noreply.github.com>
Date: Fri, 22 Sep 2023 18:28:35 -0700
Subject: [PATCH 1/3] zero bias conversion corrected

---
 .../utils/checkpoint_conversion_helpers.py | 28 ++++++++++---
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py
index 0627cec4cd..b95ebdd8bf 100644
--- a/llmfoundry/utils/checkpoint_conversion_helpers.py
+++ b/llmfoundry/utils/checkpoint_conversion_helpers.py
@@ -95,7 +95,8 @@ def get_hf_tokenizer_from_composer_state_dict(
 
 
 def _write_zero_bias(weight_name: str, weight_file_path: str,
-                     bias_shape: Union[Tuple[int, ...], int]) -> None:
+                     bias_shape: Union[Tuple[int, ...], int],
+                     np_data_type: np.dtype) -> None:
     """Write zeros for bias when converting MPT to FasterTransformer weights.
 
     MPT model might not have bias while FT expects bias.
@@ -111,13 +112,14 @@ def _write_zero_bias(weight_name: str, weight_file_path: str,
         )
     log.debug(f'zero bias for weight: {weight_name}')
     bias_file_path = weight_file_path.replace('.weight', '.bias')
-    bias = np.zeros(bias_shape, dtype=np.float32)
+    bias = np.zeros(bias_shape, dtype=np_data_type)
     bias.tofile(bias_file_path)
 
 
 def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
                                tensor_name: str, config: Dict[str, Any],
-                               data: np.ndarray) -> None:
+                               data: np.ndarray,
+                               np_weight_data_type: np.dtype) -> None:
     """Convert each MPT weight to a FasterTransformer compatible format.
 
     Args:
@@ -138,7 +140,7 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
         save_path = os.path.join(save_dir, f'model.{tensor_name}.bin')
         data.tofile(save_path)
         if 'weight' in tensor_name and config['no_bias']:
-            _write_zero_bias(tensor_name, save_path, data.shape[-1])
+            _write_zero_bias(tensor_name, save_path, data.shape[-1], np_weight_data_type)
 
     elif tensor_name.find('attention.dense.weight') != -1:
         assert data.shape == (
@@ -153,11 +155,11 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
         if config['no_bias']:
             fake_weight_path = os.path.join(save_dir,
                                             f'model.{tensor_name}.bin')
-            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1])
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1], np_weight_data_type)
 
     elif tensor_name.find('mlp.dense_4h_to_h.weight') != -1:
         assert data.shape == (
-            config['d_model'], config['mlp_ratio'] *
+            config['d_model'], config['expansion_ratio'] *
             config['d_model']), f'unexpected dim for {tensor_name}'
         # nn.Linear weights are transposed
         data = data.T
@@ -168,11 +170,11 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
         if config['no_bias']:
             fake_weight_path = os.path.join(save_dir,
                                             f'model.{tensor_name}.bin')
-            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1])
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1], np_weight_data_type)
 
     elif tensor_name.find('mlp.dense_h_to_4h.weight') != -1:
         assert data.shape == (
-            config['mlp_ratio'] * config['d_model'],
+            config['expansion_ratio'] * config['d_model'],
             config['d_model']), f'unexpected dim for {tensor_name}'
         # nn.Linear weights are transposed
         data = data.T
@@ -183,11 +185,11 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
             split_vals[j].tofile(save_path)
             if config['no_bias']:
                 _write_zero_bias(tensor_name, save_path,
-                                 split_vals[j].shape[-1])
+                                 split_vals[j].shape[-1], np_weight_data_type)
 
     elif tensor_name.find('mlp.dense_h_to_4h.bias') != -1:
         assert data.shape == (
-            config['mlp_ratio'] *
+            config['expansion_ratio'] *
             config['d_model'],), f'unexpected dim for {tensor_name}'
         split_vals = np.split(data, infer_gpu_num, axis=-1)
         for j in range(infer_gpu_num):
@@ -221,7 +223,7 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
             split_vals[j].tofile(save_path)
             if config['no_bias']:
                 _write_zero_bias(tensor_name, save_path,
-                                 (3, split_vals[j].shape[-1]))
+                                 (3, split_vals[j].shape[-1]), np_weight_data_type)
 
     else:
         raise RuntimeError(f'Tensor with name {tensor_name} is not handled')
@@ -289,7 +291,7 @@ def convert_and_save_ft_weights(named_params: dict,
                                 'model.final_layernorm.weight.bin')
         data.tofile(save_path)
         if config['no_bias']:
-            _write_zero_bias(name, save_path, data.shape[-1])
+            _write_zero_bias(name, save_path, data.shape[-1], np_weight_data_type)
     elif name == 'transformer.lm_head.weight':
         data.tofile(os.path.join(save_dir, 'model.lm_head.weight.bin'))
     else:
@@ -299,4 +301,4 @@ def convert_and_save_ft_weights(named_params: dict,
                                             'layers.').replace(
                                                 mpt_pattern, ft_pattern)
             _convert_weight_to_ft_each(save_dir, infer_gpu_num,
-                                       new_name, config, data)
+                                       new_name, config, data, np_weight_data_type)

From 92a60d9a5bbd1a8e64399e946046a7e2d6503c0a Mon Sep 17 00:00:00 2001
From: meghagarwal <16129366+megha95@users.noreply.github.com>
Date: Thu, 4 Jan 2024 12:37:38 +0000
Subject: [PATCH 2/3] docstring fix

---
 llmfoundry/utils/checkpoint_conversion_helpers.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py
index e541fda94e..bd31b575cf 100644
--- a/llmfoundry/utils/checkpoint_conversion_helpers.py
+++ b/llmfoundry/utils/checkpoint_conversion_helpers.py
@@ -122,6 +122,7 @@ def _write_zero_bias(weight_name: str, weight_file_path: str,
         weight_name (str): Name of the weight tensor.
         weight_file_path (str): Output path for storing the weight (NOT zero bias).
         bias_shape (Union[Tuple[int, ...], int]): Shape of the bias array.
+        np_data_type (np.dtype): The data type for bias.
     """
     if 'weight' not in weight_file_path:
         raise RuntimeError(

From 9bba53552287427611110c5b53a3c4fd05ed14de Mon Sep 17 00:00:00 2001
From: meghagarwal <16129366+megha95@users.noreply.github.com>
Date: Thu, 4 Jan 2024 14:23:04 +0000
Subject: [PATCH 3/3] pyright fix

---
 .../utils/checkpoint_conversion_helpers.py | 40 ++++++++++++++-----
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py
index bd31b575cf..dafeec94e1 100644
--- a/llmfoundry/utils/checkpoint_conversion_helpers.py
+++ b/llmfoundry/utils/checkpoint_conversion_helpers.py
@@ -112,8 +112,8 @@ def load_tokenizer(
 
 
 def _write_zero_bias(weight_name: str, weight_file_path: str,
-                     bias_shape: Union[Tuple[int, ...], int],
-                     np_data_type: np.dtype) -> None:
+                     bias_shape: Union[Tuple[int, ...],
+                                       int], np_data_type: np.dtype) -> None:
     """Write zeros for bias when converting MPT to FasterTransformer weights.
 
     MPT model might not have bias while FT expects bias.
@@ -158,7 +158,9 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
         save_path = os.path.join(save_dir, f'model.{tensor_name}.bin')
         data.tofile(save_path)
         if 'weight' in tensor_name and config['no_bias']:
-            _write_zero_bias(tensor_name, save_path, data.shape[-1], np_weight_data_type)
+            _write_zero_bias(tensor_name, save_path, data.shape[-1],
+                             np_weight_data_type
+                            )  # pyright: ignore [reportGeneralTypeIssues]
 
     elif tensor_name.find('attention.dense.weight') != -1:
         assert data.shape == (
@@ -173,7 +175,9 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
         if config['no_bias']:
             fake_weight_path = os.path.join(save_dir,
                                             f'model.{tensor_name}.bin')
-            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1], np_weight_data_type)
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1],
+                             np_weight_data_type
+                            )  # pyright: ignore [reportGeneralTypeIssues]
 
     elif tensor_name.find('mlp.dense_4h_to_h.weight') != -1:
         assert data.shape == (
@@ -188,7 +192,9 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
         if config['no_bias']:
             fake_weight_path = os.path.join(save_dir,
                                             f'model.{tensor_name}.bin')
-            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1], np_weight_data_type)
+            _write_zero_bias(tensor_name, fake_weight_path, data.shape[-1],
+                             np_weight_data_type
+                            )  # pyright: ignore [reportGeneralTypeIssues]
 
     elif tensor_name.find('mlp.dense_h_to_4h.weight') != -1:
         assert data.shape == (
@@ -203,7 +209,8 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
             split_vals[j].tofile(save_path)
             if config['no_bias']:
                 _write_zero_bias(tensor_name, save_path,
-                                 split_vals[j].shape[-1], np_weight_data_type)
+                                 split_vals[j].shape[-1], np_weight_data_type
+                                )  # pyright: ignore [reportGeneralTypeIssues]
 
     elif tensor_name.find('mlp.dense_h_to_4h.bias') != -1:
         assert data.shape == (
@@ -241,7 +248,9 @@ def _convert_weight_to_ft_each(save_dir: str, infer_gpu_num: int,
             split_vals[j].tofile(save_path)
             if config['no_bias']:
                 _write_zero_bias(tensor_name, save_path,
-                                 (3, split_vals[j].shape[-1]), np_weight_data_type)
+                                 (3, split_vals[j].shape[-1]),
+                                 np_weight_data_type
+                                )  # pyright: ignore [reportGeneralTypeIssues]
 
     else:
         raise RuntimeError(f'Tensor with name {tensor_name} is not handled')
@@ -309,7 +318,12 @@ def convert_and_save_ft_weights(named_params: dict,
                                 'model.final_layernorm.weight.bin')
         data.tofile(save_path)
         if config['no_bias']:
-            _write_zero_bias(name, save_path, data.shape[-1], np_weight_data_type)
+            _write_zero_bias(
+                name,
+                save_path,
+                data.shape[-1],
+                np_weight_data_type  # pyright: ignore [reportGeneralTypeIssues]
+            )
     elif name == 'transformer.lm_head.weight':
         data.tofile(os.path.join(save_dir, 'model.lm_head.weight.bin'))
     else:
@@ -318,5 +332,11 @@ def convert_and_save_ft_weights(named_params: dict,
             new_name = name.replace('transformer.blocks.',
                                     'layers.').replace(
                                         mpt_pattern, ft_pattern)
-            _convert_weight_to_ft_each(save_dir, infer_gpu_num,
-                                       new_name, config, data, np_weight_data_type)
+            _convert_weight_to_ft_each(
+                save_dir,
+                infer_gpu_num,
+                new_name,
+                config,
+                data,
+                np_weight_data_type  # pyright: ignore [reportGeneralTypeIssues]
+            )
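
Note (illustration only, not part of the patch series): a minimal standalone sketch of the bug these patches fix. The helper name below is hypothetical and mirrors the patched `_write_zero_bias`; before the fix the zero-bias placeholder was always written as `np.float32`, so a float16 conversion produced companion `.bias` files with the wrong dtype and byte size.

import numpy as np

def write_zero_bias_sketch(weight_file_path: str, bias_shape, np_data_type) -> None:
    # FasterTransformer expects a bias file next to each weight file; for a
    # no-bias MPT checkpoint a zero-filled placeholder is written instead.
    # The dtype now comes from the caller (the conversion dtype) rather than
    # being hardcoded to np.float32.
    bias_file_path = weight_file_path.replace('.weight', '.bias')
    bias = np.zeros(bias_shape, dtype=np_data_type)
    bias.tofile(bias_file_path)

# Converting in float16: the zero-bias file now matches the weight dtype,
# so a float16 reader recovers the expected number of elements.
weight = np.zeros((4, 8), dtype=np.float16)
weight.tofile('model.layers.0.attention.dense.weight.bin')
write_zero_bias_sketch('model.layers.0.attention.dense.weight.bin',
                       weight.shape[-1], np.float16)
assert np.fromfile('model.layers.0.attention.dense.bias.bin',
                   dtype=np.float16).shape == (8,)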