From 419b33a0eff3931fd41954e5e07ac7b523391983 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Sat, 27 Jul 2024 11:03:24 +0000 Subject: [PATCH 1/9] First prototype, let's jump padding free --- convert_hf_nanotron.ipynb | 764 ++++++++++++++++++++ examples/config_llama_sft.yaml | 97 +++ run_train.py | 30 +- src/nanotron/config/config.py | 18 +- src/nanotron/data/chat_dataset.py | 139 ++++ src/nanotron/data/chat_tokenizer.py | 81 +++ src/nanotron/data/collator.py | 89 ++- src/nanotron/data/dataloader_builder.py | 35 +- src/nanotron/models/llama_sft.py | 888 ++++++++++++++++++++++++ src/nanotron/trainer.py | 11 +- 10 files changed, 2141 insertions(+), 11 deletions(-) create mode 100644 convert_hf_nanotron.ipynb create mode 100644 examples/config_llama_sft.yaml create mode 100644 src/nanotron/data/chat_dataset.py create mode 100644 src/nanotron/data/chat_tokenizer.py create mode 100644 src/nanotron/models/llama_sft.py diff --git a/convert_hf_nanotron.ipynb b/convert_hf_nanotron.ipynb new file mode 100644 index 00000000..943b1af9 --- /dev/null +++ b/convert_hf_nanotron.ipynb @@ -0,0 +1,764 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from torch.testing import assert_close\n", + "\n", + "import os\n", + "\n", + "dtype = torch.bfloat16\n", + "device = torch.device(\"cuda\")\n", + "\n", + "os.environ[\"WORLD_SIZE\"] = \"1\"\n", + "os.environ[\"RANK\"] = \"0\"\n", + "os.environ[\"MASTER_ADDR\"] = \"0.0.0.0\"\n", + "os.environ[\"MASTER_PORT\"] = \"6000\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/solergib/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. 
Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", + "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 13.70it/s]\n" + ] + } + ], + "source": [ + "from transformers import AutoModelForCausalLM\n", + "PATH_TO_LLAMA = \"/mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct\"\n", + "hf_model = AutoModelForCausalLM.from_pretrained(PATH_TO_LLAMA, torch_dtype=dtype, attn_implementation=\"flash_attention_2\").to(device)\n", + "# print(hf_model)\n", + "# print(hf_model.config)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LlamaConfig {\n", + " \"architectures\": [\n", + " \"LlamaForCausalLM\"\n", + " ],\n", + " \"attention_bias\": false,\n", + " \"attention_dropout\": 0.0,\n", + " \"bos_token_id\": 128000,\n", + " \"eos_token_id\": 128001,\n", + " \"hidden_act\": \"silu\",\n", + " \"hidden_size\": 4096,\n", + " \"initializer_range\": 0.02,\n", + " \"intermediate_size\": 14336,\n", + " \"max_position_embeddings\": 8192,\n", + " \"mlp_bias\": false,\n", + " \"model_type\": \"llama\",\n", + " \"num_attention_heads\": 32,\n", + " \"num_hidden_layers\": 32,\n", + " \"num_key_value_heads\": 8,\n", + " \"pretraining_tp\": 1,\n", + " \"rms_norm_eps\": 1e-05,\n", + " \"rope_scaling\": null,\n", + " \"rope_theta\": 500000.0,\n", + " \"tie_word_embeddings\": false,\n", + " \"torch_dtype\": \"bfloat16\",\n", + " \"transformers_version\": \"4.44.0.dev0\",\n", + " \"use_cache\": true,\n", + " \"vocab_size\": 128256\n", + "}\n", + "\n" + ] + } + ], + "source": [ + "from transformers import LlamaConfig\n", + "hf_config = LlamaConfig.from_pretrained(PATH_TO_LLAMA)\n", + "print(hf_config)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from nanotron.config import ParallelismArgs\n", + "from nanotron.parallel import ParallelContext\n", + "from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine\n", + "from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode\n", + "\n", + "DP = 1\n", + "PP = 1\n", + "TP = 1\n", + "\n", + "parallel_config = ParallelismArgs(\n", + " dp=DP,\n", + " pp=PP,\n", + " tp=TP,\n", + " pp_engine=AllForwardAllBackwardPipelineEngine(),\n", + " tp_mode=TensorParallelLinearMode.ALL_REDUCE,\n", + " tp_linear_async_communication=False,\n", + ")\n", + "assert (\n", + " parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE\n", + " and parallel_config.tp_linear_async_communication is False\n", + ")\n", + "\n", + "parallel_context = ParallelContext(\n", + " data_parallel_size=parallel_config.dp,\n", + " pipeline_parallel_size=parallel_config.pp,\n", + " tensor_parallel_size=parallel_config.tp,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron\n", + "\n", + "nanotron_config = LlamaConfigNanotron(\n", + " bos_token_id=hf_config.bos_token_id,\n", + " eos_token_id=hf_config.eos_token_id,\n", + " hidden_act=hf_config.hidden_act,\n", + " hidden_size=hf_config.hidden_size,\n", + " initializer_range=hf_config.initializer_range,\n", + " intermediate_size=hf_config.intermediate_size,\n", + " is_llama_config=True,\n", + " max_position_embeddings=hf_config.max_position_embeddings,\n", + " num_attention_heads=hf_config.num_attention_heads,\n", + " 
num_hidden_layers=hf_config.num_hidden_layers,\n", + " num_key_value_heads=hf_config.num_key_value_heads,\n", + " pad_token_id=None,\n", + " pretraining_tp=hf_config.pretraining_tp,\n", + " rms_norm_eps=hf_config.rms_norm_eps,\n", + " rope_scaling=hf_config.rope_scaling,\n", + " rope_theta=hf_config.rope_theta,\n", + " rope_interleaved=False,\n", + " tie_word_embeddings=hf_config.tie_word_embeddings,\n", + " use_cache=hf_config.use_cache,\n", + " vocab_size=hf_config.vocab_size,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from nanotron.models.llama_sft import LlamaForSFT\n", + "from nanotron.models import build_model\n", + "\n", + "nanotron_model = build_model(\n", + " model_builder=lambda: LlamaForSFT(\n", + " config=nanotron_config,\n", + " parallel_context=parallel_context,\n", + " parallel_config=parallel_config,\n", + " random_states=None,\n", + " ),\n", + " parallel_context=parallel_context,\n", + " dtype=dtype,\n", + " device=device,\n", + ")\n", + "# print(nanotron_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from nanotron.trainer import mark_tied_parameters\n", + "\n", + "mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ShardedInfo(global_ranks=(0,), local_global_slices_pairs=(SlicesPair(local_slices=(slice(None, None, None), slice(None, None, None)), global_slices=(slice(0, 128256, None), slice(None, None, None))),), unsharded_shape=(128256, 4096))" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.get_sharded_info()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.is_tied" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Final script\n", + "# TODO Añadir variables de TP para splitear los parametros de las layers de HF\n", + "# TODO Cargar modelo HF en cpu y copiar desde ahi\n", + "\n", + "\n", + "# Token embeddings\n", + "assert nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape == hf_model.model.embed_tokens.weight.shape\n", + "\n", + "with torch.no_grad():\n", + " nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.copy_(hf_model.model.embed_tokens.weight)# = hf_model.model.embed_tokens.weight.data\n", + "\n", + "# Decoder layers\n", + "for i in range(nanotron_config.num_hidden_layers):\n", + " # Input layer norm\n", + " assert hf_model.model.layers[i].input_layernorm.weight.shape == nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.shape\n", + " with torch.no_grad():\n", + " nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.copy_(hf_model.model.layers[i].input_layernorm.weight)# = hf_model.model.layers[i].input_layernorm.weight\n", + " # Self attn\n", + " ## QKV\n", + " tmp_qkv_proj = torch.cat([\n", + " hf_model.model.layers[i].self_attn.q_proj.weight,\n", + " 
hf_model.model.layers[i].self_attn.k_proj.weight,\n", + " hf_model.model.layers[i].self_attn.v_proj.weight\n", + " ], dim = 0) \n", + " assert tmp_qkv_proj.shape == nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.shape\n", + " with torch.no_grad():\n", + " nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.copy_(tmp_qkv_proj)# = tmp_qkv_proj # torch.nn.Parameter(tmp_qkv_proj)\n", + " \n", + " ## O\n", + " assert hf_model.model.layers[i].self_attn.o_proj.weight.shape == nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.shape\n", + " with torch.no_grad():\n", + " nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.copy_(hf_model.model.layers[i].self_attn.o_proj.weight)# = hf_model.model.layers[i].self_attn.o_proj.weight\n", + " # MLP\n", + " ## Gate Up Proj\n", + " tmp_gate_up_proj = torch.cat([\n", + " hf_model.model.layers[i].mlp.gate_proj.weight,\n", + " hf_model.model.layers[i].mlp.up_proj.weight,\n", + " ], dim = 0)\n", + "\n", + " assert tmp_gate_up_proj.shape == nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.shape\n", + " with torch.no_grad():\n", + " nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.copy_(tmp_gate_up_proj)# = tmp_gate_up_proj\n", + " ## Down Proj\n", + " assert hf_model.model.layers[i].mlp.down_proj.weight.shape == nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.shape\n", + " with torch.no_grad():\n", + " nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.copy_(hf_model.model.layers[i].mlp.down_proj.weight)# = hf_model.model.layers[i].mlp.down_proj.weight\n", + "\n", + "\n", + " # Post attn layer norm\n", + " assert hf_model.model.layers[i].post_attention_layernorm.weight.shape == nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.shape\n", + " with torch.no_grad():\n", + " nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.copy_(hf_model.model.layers[i].post_attention_layernorm.weight)# = hf_model.model.layers[i].post_attention_layernorm.weight\n", + " \n", + "# Last layer norm\n", + "assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape\n", + "with torch.no_grad():\n", + " nanotron_model.model.final_layer_norm.pp_block.weight.copy_(hf_model.model.norm.weight)# = hf_model.model.norm.weight\n", + "# LM_Head\n", + "assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape\n", + "with torch.no_grad():\n", + " nanotron_model.model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight)# = hf_model.lm_head.weight" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from nanotron.data.chat_dataset import ChatDataset\n", + "from nanotron.data.dataloader_builder import build_chat_dataloader\n", + "\n", + "train_dataset = ChatDataset(\n", + " dataset_path=\"Open-Orca/SlimOrca\",\n", + " tokenizer_name_or_path=PATH_TO_LLAMA,\n", + " sequence_length=2048,\n", + " train_on_completions_only=True,\n", + " remove_cross_attention=True,\n", + " split=\"train\",\n", + " conversation_column_name=\"conversations\",\n", + " dp_rank=parallel_context.dp_pg.rank(),\n", + " dp_ranks_size=parallel_context.dp_pg.size(),\n", + ")\n", + "\n", + "# Prepare dataloader\n", + "train_dataloader = build_chat_dataloader(\n", + " dataset=train_dataset,\n", + " sequence_length=2048,\n", + " parallel_context=parallel_context,\n", + " input_pp_rank=0,\n", + " output_pp_rank=0,\n", + ")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "batch = next(iter(train_dataloader))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", + " 128009, 128009, 128009, 128009, 128009, 128009]], dtype=torch.int32)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch[\"input_ids\"][:, -150:]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[128000, 128006, 26380, ..., 13, 128009, 128001]],\n", + " dtype=torch.int32)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch[\"input_ids\"][:, :-150]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LlamaForCausalLM(\n", + " (model): LlamaModel(\n", + " (embed_tokens): Embedding(128256, 4096)\n", + " (layers): ModuleList(\n", + " (0-31): 32 x LlamaDecoderLayer(\n", + " (self_attn): LlamaFlashAttention2(\n", + " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n", + " (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n", + " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (mlp): LlamaMLP(\n", + " (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n", + " (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n", + " (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n", + " (act_fn): SiLU()\n", + " )\n", + " (input_layernorm): LlamaRMSNorm()\n", + " (post_attention_layernorm): LlamaRMSNorm()\n", + " )\n", + " )\n", + " (norm): LlamaRMSNorm()\n", + " (rotary_emb): LlamaRotaryEmbedding()\n", + " )\n", + " (lm_head): Linear(in_features=4096, out_features=128256, bias=False)\n", + ")" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nanotron_model.eval()\n", + "hf_model.eval()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "with torch.no_grad():\n", + " output_nanotron = nanotron_model.model(input_ids=batch[\"input_ids\"][:, :-150].cuda(), position_ids = batch[\"position_ids\"][:, :-150].cuda())" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n", + "PEPEPEPEPE\n" + ] + } + ], + "source": [ + "with torch.no_grad():\n", + " output_hf = hf_model(input_ids=batch[\"input_ids\"][:, :-150].cuda(), position_ids = batch[\"position_ids\"][:, :-150].cuda())" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "Tensor-likes are not close!\n\nMismatched elements: 243083431 / 243429888 (99.9%)\nGreatest absolute difference: 46.65625 at index (0, 1125, 22) (up to 1e-05 allowed)\nGreatest relative difference: 74448896.0 at index (0, 715, 31230) (up to 1.3e-06 allowed)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[38], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtesting\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m assert_close\n\u001b[0;32m----> 3\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", + "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 243083431 / 243429888 
(99.9%)\nGreatest absolute difference: 46.65625 at index (0, 1125, 22) (up to 1e-05 allowed)\nGreatest relative difference: 74448896.0 at index (0, 715, 31230) (up to 1.3e-06 allowed)" + ] + } + ], + "source": [ + "from torch.testing import assert_close\n", + "\n", + "assert_close(output_hf.logits, output_nanotron.transpose(0,1))" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[HF Model] Next token: 11415, probability: 0.10412170737981796\n", + "[HF Model] Next token: 1523, probability: 0.04918361455202103\n", + "[HF Model] Next token: 47032, probability: 0.043404385447502136\n", + "[HF Model] Next token: 72514, probability: 0.03830423951148987\n", + "[HF Model] Next token: 3493, probability: 0.03830423951148987\n", + "[HF Model] Next token: 10477, probability: 0.03830423951148987\n", + "[HF Model] Next token: 16805, probability: 0.03175532445311546\n", + "[HF Model] Next token: 10552, probability: 0.026326090097427368\n", + "[HF Model] Next token: 7664, probability: 0.021825095638632774\n", + "[HF Model] Next token: 3041, probability: 0.018093638122081757\n" + ] + } + ], + "source": [ + "predicted_token = 34\n", + "\n", + "next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1)\n", + "hf_topk_next_tokens= torch.topk(next_tokens_hf, 10)\n", + "\n", + "\n", + "print(*[f\"[HF Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values)], sep=\"\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Nanotron Model] Next token: 220, probability: 0.0804644376039505\n", + "[Nanotron Model] Next token: 994, probability: 0.029601214453577995\n", + "[Nanotron Model] Next token: 3639, probability: 0.02612297795712948\n", + "[Nanotron Model] Next token: 656, probability: 0.024540266022086143\n", + "[Nanotron Model] Next token: 279, probability: 0.024540266022086143\n", + "[Nanotron Model] Next token: 3277, probability: 0.021656708791851997\n", + "[Nanotron Model] Next token: 264, probability: 0.013982621021568775\n", + "[Nanotron Model] Next token: 1148, probability: 0.01022990420460701\n", + "[Nanotron Model] Next token: 507, probability: 0.01022990420460701\n", + "[Nanotron Model] Next token: 323, probability: 0.01022990420460701\n" + ] + } + ], + "source": [ + "next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0,1)[0, predicted_token, :], -1)\n", + "nanotron_topk_next_tokens= torch.topk(next_tokens_nanotron, 10)\n", + "\n", + "\n", + "print(*[f\"[Nanotron Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(nanotron_topk_next_tokens.indices, nanotron_topk_next_tokens.values)], sep=\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save the Nanotron model" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "from nanotron.parallel.parameters import sanity_check\n", + "\n", + "sanity_check(root_module=nanotron_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Saving weights: 100%|██████████| 195/195 [00:41<00:00, 4.67it/s]\n" + ] + } + ], + "source": [ + "from pathlib import Path\n", + "from nanotron.serialize import save_meta, save_weights, 
TrainingMetadata\n", + "from nanotron.serialize.metadata import DataStageMetadata\n", + "\n", + "out_path = \"/mloscratch/homes/solergib/converter/nanotron/n_c/first/\"\n", + "out_path = Path(out_path)\n", + "\n", + "save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=out_path)\n", + "\n", + "training_metadata = TrainingMetadata(last_train_step=0, consumed_train_samples=0, data_stages=[DataStageMetadata(name=\"Empty\", consumed_train_samples=0, start_training_step=0)])\n", + "\n", + "save_meta(root_folder=out_path, parallel_context=parallel_context, training_metadata=training_metadata)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving config ...\n", + "Saving model config ...\n" + ] + } + ], + "source": [ + "import json \n", + "import yaml\n", + "from nanotron.config import GeneralArgs, ModelArgs, TokenizerArgs, Config\n", + "from nanotron.config.models_config import ExistingCheckpointInit\n", + "from dataclasses import asdict\n", + "\n", + "with open(out_path / \"config.yaml\", \"w\") as f:\n", + " config = Config(\n", + " general=GeneralArgs(project=\"conversion\", run=\"Llama3-8B\"),\n", + " parallelism=parallel_config,\n", + " model=ModelArgs(\n", + " init_method=ExistingCheckpointInit(out_path),\n", + " model_config=nanotron_config,\n", + " ),\n", + " tokenizer=TokenizerArgs(PATH_TO_LLAMA),\n", + " )\n", + " print(\"Saving config ...\")\n", + " yaml.dump(config.as_dict(), f)\n", + "\n", + "with open(out_path / \"model_config.json\", \"w\") as f:\n", + " print(\"Saving model config ...\")\n", + " json.dump(asdict(nanotron_config), f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/mloscratch/homes/solergib/SFT/transformers/src/transformers/deepspeed.py:24: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. 
Please import deepspeed modules directly from transformers.integrations\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'input_ids': tensor([[27, 22, 0, 97, 13, 49, 56, 35, 70, 91, 38, 30, 26, 94, 68, 46, 89, 32,\n", + " 70, 85, 50, 67, 70, 86, 66, 82, 18, 72, 27, 37, 91, 27, 60, 57, 23, 93,\n", + " 10, 80, 82, 26, 13, 50, 12, 68, 63, 85, 55, 1, 3, 61, 37, 70, 12, 97,\n", + " 1, 59, 90, 45, 74, 62, 66, 54, 94, 18, 54, 89, 49, 3, 66, 55]],\n", + " device='cuda:0'), 'position_ids': tensor([[0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2,\n", + " 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5,\n", + " 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],\n", + " device='cuda:0')}\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append(\"/mloscratch/homes/solergib/SFT/transformers\")\n", + "\n", + "import torch\n", + "from t_tests.models.llama.test_modeling_llama import LlamaModelTester\n", + "\n", + "lmt = LlamaModelTester(parent=None)\n", + "\n", + "_, inputs_dict = lmt.prepare_config_and_inputs_for_common()\n", + "dummy_attention_mask = inputs_dict[\"attention_mask\"]\n", + "inputs_dict[\"input_ids\"][~dummy_attention_mask.bool()] = 0\n", + "\n", + "padfree_inputs_dict = {\n", + " k: v[dummy_attention_mask.bool()].unsqueeze(0)\n", + " for k, v in inputs_dict.items()\n", + " if not k == \"attention_mask\"\n", + "}\n", + "\n", + "padfree_inputs_dict[\"position_ids\"] = (\n", + " torch.cat([torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()])\n", + " .long()\n", + " .unsqueeze(0)\n", + " .to(\"cuda\")\n", + ")\n", + "\n", + "print(padfree_inputs_dict)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/config_llama_sft.yaml b/examples/config_llama_sft.yaml new file mode 100644 index 00000000..d65f7683 --- /dev/null +++ b/examples/config_llama_sft.yaml @@ -0,0 +1,97 @@ +checkpoints: + checkpoint_interval: 1000 + checkpoints_path: /mloscratch/homes/solergib/converter/nanotron/checkpoints + checkpoints_path_is_shared_file_system: false + resume_checkpoint_path: null + save_initial_state: false +data_stages: +- data: + dataset: + hf_dataset: Open-Orca/SlimOrca + hf_dataset_split: train + conversation_column_name: conversations + train_on_completions_only: true + remove_cross_attention: true + num_loading_workers: 1 + seed: 42 + name: General purpose training (Single dataset) + start_training_step: 1 +general: + benchmark_csv_path: null + consumed_train_samples: null + ignore_sanity_checks: true + project: Chat + run: Llama3-8B + seed: 42 + step: null +lighteval: null +logging: + iteration_step_info_interval: 1 + log_level: info + log_level_replica: info +model: + ddp_bucket_cap_mb: 25 + dtype: bfloat16 + init_method: + std: 0.025 + make_vocab_size_divisible_by: 1 + model_config: + bos_token_id: 128000 + eos_token_id: 128001 + hidden_act: silu + hidden_size: 4096 + initializer_range: 0.02 + intermediate_size: 14336 + is_llama_config: true + max_position_embeddings: 4096 + num_attention_heads: 32 + num_hidden_layers: 4 + num_key_value_heads: 8 + pad_token_id: 
null + pretraining_tp: 1 + rms_norm_eps: 1.0e-05 + rope_scaling: null + rope_theta: 500000.0 + tie_word_embeddings: false + use_cache: true + vocab_size: 128256 +optimizer: + accumulate_grad_in_fp32: true + clip_grad: 1.0 + learning_rate_scheduler: + learning_rate: 0.0003 + lr_decay_starting_step: null + lr_decay_steps: 98 + lr_decay_style: cosine + lr_warmup_steps: 2 + lr_warmup_style: linear + min_decay_lr: 1.0e-05 + optimizer_factory: + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-08 + name: adamW + torch_adam_is_fused: true + weight_decay: 0.01 + zero_stage: 0 +parallelism: + dp: 1 + expert_parallel_size: 1 + pp: 1 + pp_engine: 1f1b + tp: 1 + tp_linear_async_communication: false + tp_mode: ALL_REDUCE +profiler: null +tokenizer: + tokenizer_max_length: null + tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + tokenizer_revision: null +tokens: + batch_accumulation_per_replica: 1 + limit_test_batches: 0 + limit_val_batches: 0 + micro_batch_size: 3 + sequence_length: 4096 + train_steps: 100 + val_check_interval: -1 diff --git a/run_train.py b/run_train.py index 021d955d..60f01373 100644 --- a/run_train.py +++ b/run_train.py @@ -12,8 +12,9 @@ import numpy as np from nanotron import logging -from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs -from nanotron.data.dataloader_builder import build_nanoset_dataloader +from nanotron.config import ChatDatasetsArgs, DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs +from nanotron.data.chat_dataset import ChatDataset +from nanotron.data.dataloader_builder import build_chat_dataloader, build_nanoset_dataloader from nanotron.dataloader import ( clm_process, dummy_infinite_data_generator, @@ -171,6 +172,31 @@ def get_dataloader_from_data_stage( dataloader_drop_last=True, ) + return train_dataloader + # Case 4: Chat Datasets + elif isinstance(data.dataset, ChatDatasetsArgs): + with main_rank_first(trainer.parallel_context.world_pg): + train_dataset = ChatDataset( + dataset_path=data.dataset.hf_dataset, + tokenizer_name_or_path=trainer.config.tokenizer.tokenizer_name_or_path, + sequence_length=trainer.sequence_length, + train_on_completions_only=data.dataset.train_on_completions_only, + remove_cross_attention=data.dataset.remove_cross_attention, + split=data.dataset.hf_dataset_split, + conversation_column_name=data.dataset.conversation_column_name, + dp_rank=trainer.parallel_context.dp_pg.rank(), + dp_ranks_size=trainer.parallel_context.dp_pg.size(), + ) + + # Prepare dataloader + train_dataloader = build_chat_dataloader( + dataset=train_dataset, + sequence_length=trainer.sequence_length, + parallel_context=trainer.parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + ) + return train_dataloader else: raise ValueError(f"Unhandled case of `self.config.data.dataset`. 
Got: {data.dataset}") diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index 05b49955..96337e9a 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -107,11 +107,27 @@ def __post_init__(self): self.dataset_weights = list(tmp_dataset_folder.values()) +@dataclass +class ChatDatasetsArgs: + hf_dataset: str + hf_dataset_split: str + conversation_column_name: str + # Debug + train_on_completions_only: bool = True + remove_cross_attention: bool = True + + def __post_init__(self): + if self.hf_dataset_split is None: + self.hf_dataset_split = "train" + if self.conversation_column_name is None: + self.conversation_column_name = "conversations" + + @dataclass class DataArgs: """Arguments related to the data and data files processing""" - dataset: Union[PretrainDatasetsArgs, NanosetDatasetsArgs] + dataset: Union[PretrainDatasetsArgs, NanosetDatasetsArgs, ChatDatasetsArgs] seed: Optional[int] num_loading_workers: Optional[int] = 1 diff --git a/src/nanotron/data/chat_dataset.py b/src/nanotron/data/chat_dataset.py new file mode 100644 index 00000000..ac46ba42 --- /dev/null +++ b/src/nanotron/data/chat_dataset.py @@ -0,0 +1,139 @@ +from typing import List + +import numpy as np +from datasets import load_dataset +from datasets.distributed import split_dataset_by_node +from nanotron.data.chat_tokenizer import ChatTokenizer +from nanotron.data.collator import ( + build_labels, + build_labels_completions_only, + build_position_ids, + build_position_ids_dummy, +) +from torch.utils.data import IterableDataset +from transformers import AutoTokenizer + + +class ChatDataset(IterableDataset): + """ + Chat Dataset for training models with: + 1. Packing + 2. No cross-contamination between packed samples + 3. Train on completitions only + + Args: + dataset_path (str): Path to the dataset in the file system. If provided, data will be loaded from this path instead of downloaded. + tokenizer_name_or_path (str): Path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub. + seq_len (int): max sequence length + train_on_completions_only (bool): Whether to just train on completitions or not. To be deleted + remove_cross_attention (bool): Whether to just attend to the tokens from the same sample or to all (Vanilla mechanism). To be deleted + split (str): Split of the dataset to train on + conversation_column_name (str): Column name of the dataset containing the conversations + dp_rank (int): rank of the current data parallel process + dp_ranks_size (int): number of data parallel processes participating in training + """ + + def __init__( + self, + dataset_path: str, + tokenizer_name_or_path, + sequence_length: int, + conversation_column_name: str, + train_on_completions_only: bool = True, + remove_cross_attention: bool = True, + split: str = "train", + dp_rank: int = 0, + dp_ranks_size: int = 1, + skip_num_samples: int = None, # TODO Delete, check later comment + seed: int = 1234, + ) -> None: + + # TODO: Support checkpointing for resuming training. We have to store the number of consumed samples from the dataset (Which is different from the number of steps) and the buffers. 
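To make the `train_on_completions_only` behaviour described in the `ChatDataset` docstring concrete, here is a small stand-alone sketch (an editor illustration, not part of the patch) of the labeling rule the dataset relies on: only tokens flagged as assistant completions keep their ids as labels, everything else is masked with -100 so it does not contribute to the loss. The token ids below are made up.

```python
import numpy as np

# Illustrative ids for "<|begin_of_text|> ... user turn ... assistant turn <|eot_id|>" (values are made up).
tokens        = [128000, 9125, 271, 2675, 128009, 78191, 9906, 1917, 128009]
is_completion = [False, False, False, False, False, False, True, True, True]

input_ids = np.array(tokens[:-1], dtype=np.int32)   # the dataset drops the last token from the inputs
labels = np.where(is_completion, tokens, -100)[1:]  # shift left by one, mask every non-assistant token

# input_ids -> [128000, 9125, 271, 2675, 128009, 78191, 9906, 1917]
# labels    -> [-100, -100, -100, -100, -100, 9906, 1917, 128009]
```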
+ # skip_num_samples will fail, as it's computed with the number of steps and as we are packing sequences we might have consumed MORE samples from the dataset + # TODO: Support interleaving datasets + + self.dataset_path = dataset_path + self.chat_tokenizer = ChatTokenizer(tokenizer_name_or_path) + self.sequence_length = sequence_length + self.conversation_column_name = conversation_column_name + self.skip_num_samples = skip_num_samples + self.seed = seed + + # Load, split and shuffle dataset. Also skip samples if resuming training. + self.dataset = load_dataset(dataset_path, split=split, streaming=True) + self.dataset = split_dataset_by_node(self.dataset, dp_rank, dp_ranks_size) + self.dataset = self.dataset.shuffle(seed=seed, buffer_size=10_000) + + # TODO delete, just 4 switching the training only on completitions setting + if train_on_completions_only: + self.create_labels = build_labels_completions_only + else: + self.create_labels = build_labels + + # TODO delete, just 4 switching the remove cross-attention setting + if remove_cross_attention: + self.create_position_ids = build_position_ids + else: + self.create_position_ids = build_position_ids_dummy + + # Todo delete (debug), just change the dict keys + self.debug_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) # TODO delete debug + self.debug_tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n'+ message['value'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>' }}{% endif %}" + + def __iter__(self): + max_buffer_token_len = 1 + self.sequence_length + buffer_tokens: List[int] = [] + buffer_is_completition: List[int] = [] + buffer_lengths: List[int] = [] + + while True: + for sample in iter(self.dataset): + tokens, is_completition = self.chat_tokenizer(sample[self.conversation_column_name]) + + # TODO assert that tokenized conversations are not longer than max_buffer_token_len? + + # TODO delete (debug). The [:-1] of tokens is because apply chat template doesn't adds eos (NOT eot) token + assert ( + self.debug_tokenizer.apply_chat_template(sample["conversations"]) == tokens[:-1] + ), f'{self.debug_tokenizer.apply_chat_template(sample["conversations"])}\n\n{tokens[:-1]}' + + buffer_tokens.extend(tokens) + buffer_is_completition.extend(is_completition) + buffer_lengths.append(len(tokens)) + + if len(buffer_tokens) > max_buffer_token_len: # Can't pack more samples, yield + # Pop last sample from buffers + sample_tokens = buffer_tokens[: -len(tokens)] + sample_completitions = buffer_is_completition[: -len(tokens)] + sample_lengths = buffer_lengths[:-1] + + # TODO delete (debug) + assert len(sample_tokens) == len(sample_completitions) == sum(sample_lengths) + + # Reset tokens buffers + buffer_tokens = tokens.copy() + buffer_is_completition = is_completition.copy() + buffer_lengths = [len(tokens)] + + # Pad to max_buffer_token_len. 
Pad token added in ChatTokenizer init if necessary + sample_tokens.extend( + [self.chat_tokenizer.tokenizer.pad_token_id] * (max_buffer_token_len - len(sample_tokens)) + ) + sample_completitions.extend([False] * (max_buffer_token_len - len(sample_completitions))) + + # TODO delete, just 4 switching the training only on completitions setting + labels = self.create_labels(sample_tokens, sample_completitions) + + # TODO delete, just 4 switching the remove cross-attention setting + position_ids = self.create_position_ids(sample_lengths, self.sequence_length) + + # TODO delete (debug) + assert len(sample_tokens) == max_buffer_token_len + + yield { + "input_ids": np.array(sample_tokens[:-1], dtype=np.int32), + "label_ids": labels, + "position_ids": position_ids, + } + + print("Consumed all samples, dataset is being re-looped.") diff --git a/src/nanotron/data/chat_tokenizer.py b/src/nanotron/data/chat_tokenizer.py new file mode 100644 index 00000000..847a365f --- /dev/null +++ b/src/nanotron/data/chat_tokenizer.py @@ -0,0 +1,81 @@ +from typing import List, Tuple + +from transformers import AutoTokenizer + + +class ChatTokenizer: + """ + The ChatTokenizer encodes a conversation applying the Llama3 Chat Template and returns the role (Either User or Assistant) of each token + + Args: + tokenizer_name_or_path (str): A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub. + """ + + def __init__(self, tokenizer_name_or_path: str): + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + + # Add pad token if necessary + if self.tokenizer.pad_token is None: + self.tokenizer.add_special_tokens({"pad_token": "<|eot_id|>"}) + + def __call__(self, conversation: List[dict]) -> Tuple[List[int], List[bool]]: + """ + Applies the Llama3 chat template, encodes the conversation and returns the tokens along with a bool value for each token whether if the token belongs to the answer of the assistant or not to be able to just train on the assistant answers + Args: + conversation (List[dict]): List of dicts where each dict contains the "from" key to specify the emisor del mensaje and the "value" key with the message. + Same format as SlimOrca dataset with possible from values: "System", "human" and "gpt" + Example: + conversation: [ { "from": "system", "value": "You are an AI assistant that follows instruction extremely well. Help as much as you can."}, + { "from": "human", "value": "Answer the following question: - number is 54 - debutteam is pittsburgh steelers - draftpick is 166 - birth date is 24 may 1982 - weight is 243 - nfl is wal475737 - debutyear is 2005 - finalteam is new york sentinels - statlabel is tackles sacks interceptions - heightin is 3 - statvalue is 9 0.0 1 - heightft is 6 - college is temple - birth place is pottstown , pennsylvania - draftyear is 2005 - position is linebacker - draftround is 5 - finalyear is 2009 Given the details above, guess who could this information be about.\nAnswer:"}, + { "from": "gpt", "value": "The information provided seems to refer to Rian Wallace, a former NFL player."} ] + + After applying chat template: + <|begin_of_text|><|start_header_id|>system<|end_header_id|> + + You are an AI assistant that follows instruction extremely well. 
Help as much as you can.<|eot_id|><|start_header_id|>human<|end_header_id|> + + Answer the following question: - number is 54 - debutteam is pittsburgh steelers - draftpick is 166 - birth date is 24 may 1982 - weight is 243 - nfl is wal475737 - debutyear is 2005 - finalteam is new york sentinels - statlabel is tackles sacks interceptions - heightin is 3 - statvalue is 9 0.0 1 - heightft is 6 - college is temple - birth place is pottstown , pennsylvania - draftyear is 2005 - position is linebacker - draftround is 5 - finalyear is 2009 Given the details above, guess who could this information be about. + Answer:<|eot_id|><|start_header_id|>gpt<|end_header_id|> + + The information provided seems to refer to Rian Wallace, a former NFL player.<|eot_id|> + returns: + tokens (List[int]): A list of tokens e.g. [128000, 128006, 9125, 128007, 271, 2675, 527, ..., 12873, 2851, 13, 128009, 128001] + is_completitions (List[bool]): A list of bools whether the tokens belong to the assistant answer or not e.g. [False, False, False, ..., False, True, True, True, True] + """ + tokens = [] + # Append <|begin_of_text|> + tokens.append(self.tokenizer.bos_token_id) + is_completitions = [False] * len(tokens) + + for message in conversation: + message_tokens, message_completitions = self.encode_message(message) + tokens.extend(message_tokens) + is_completitions.extend(message_completitions) + + # Append <|end_of_text|> token + tokens.extend(self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)) + is_completitions.append(True) + + return tokens, is_completitions + + def encode_message(self, message: dict) -> Tuple[List[int], List[int]]: + # TODO The "from", "value", "gpt" keys are form SlimOrca Dataset. Llama3 uses another ones. We should stick to a + # single format and document it properly rather than supporting multiple formats, as each one will need a different + # ChatTokenizer and the idea is that all Datasets share the same ChatTokenizer + + # Encode header + tokens = self.tokenizer.encode( + f"<|start_header_id|>{message['from']}<|end_header_id|>\n\n", add_special_tokens=False + ) + is_completitions = [False] * len(tokens) + + # Encode message + tokens.extend(self.tokenizer.encode(message["value"].strip(), add_special_tokens=False)) + + # Append <|eot_id|> token + tokens.extend(self.tokenizer.encode("<|eot_id|>", add_special_tokens=False)) + + # True if token belongs to assistant answer, False otherwise + is_completitions.extend([True if message["from"] == "gpt" else False] * (len(tokens) - len(is_completitions))) + + return tokens, is_completitions diff --git a/src/nanotron/data/collator.py b/src/nanotron/data/collator.py index 199527e1..b34a7369 100644 --- a/src/nanotron/data/collator.py +++ b/src/nanotron/data/collator.py @@ -1,4 +1,4 @@ -import dataclasses +from dataclasses import dataclass from typing import Dict, List, Union import numpy as np @@ -8,7 +8,7 @@ from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -@dataclasses.dataclass +@dataclass class NanosetDataCollatorForCLM: """ Data collator used for causal language modeling with Nanosets dataset. @@ -78,3 +78,88 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni ) return result + + +# TODO Find a more elegant way. e.g. extend instead of append. 
OK, so no extend +# We could compute position ids after tokenizing each sample but we will still miss the last length of the padding tokens +def build_position_ids(lengths, sequence_length) -> np.array: + position_ids = [list(range(length)) for length in lengths] # Create position ids list + position_ids.append([0] * (sequence_length - sum(lengths))) # Append position_ids of the padding tokens + return np.array([x for xs in position_ids for x in xs], dtype=np.int32) # Flatten list of position ids + + +# TODO delete, just 4 switching the remove cross-attention setting +def build_position_ids_dummy(lengths, sequence_length) -> np.array: + return np.array(list(range(sequence_length)), dtype=np.int32) # TODO numpy arange + + +# TODO delete, just 4 switching the training only on completitions setting. This will be in the __iter__ method instead of a function +def build_labels_completions_only(input_ids, is_completitions): + labels = np.where( + is_completitions, input_ids, -100 + ) # Mask tokens that don't belong to the completitions by the Assistant + return np.array(labels[1:], dtype=np.int32) + + +# TODO delete, just 4 switching the training only on completitions setting +def build_labels(input_ids, is_completitions): + return np.array(input_ids[1:], dtype=np.int32) + + +@dataclass +class NanoChatDataCollatorForSFT: # TODO(tj.solergibert) Find a better name + """ + Data collator used with Chat Dataset. + - sequence_length: Sequence length of each sample in the batch + - input_pp_rank: Discards last input id token + - output_pp_rank: Discards first label id token + - other pp ranks: Don't have data. Instead, we use `TensorPointer` to point to the rank having the data. + """ + + sequence_length: int + input_pp_rank: int + output_pp_rank: int + parallel_context: ParallelContext + + def __call__(self, examples: List[Dict[str, List[int]]]) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + # Process the case when current rank doesn't require data. We return `TensorPointer` that points to ranks having the data. 
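As a quick illustration of `build_position_ids` defined above (toy lengths, not from the patch): two packed samples of lengths 3 and 4 each restart the position counter at 0, and the trailing padding positions are all 0. Each 0 marks the start of a new sub-sequence, which is what later lets the model rebuild `cu_seqlens`; `build_position_ids_dummy` instead returns a plain `arange`, i.e. vanilla attention across packed samples.

```python
import numpy as np

def toy_position_ids(lengths, sequence_length):
    # Same logic as build_position_ids: restart the counter per sample, pad with zeros.
    position_ids = [list(range(length)) for length in lengths]
    position_ids.append([0] * (sequence_length - sum(lengths)))
    return np.array([x for xs in position_ids for x in xs], dtype=np.int32)

print(toy_position_ids([3, 4], 10))   # [0 1 2 0 1 2 3 0 0 0]
print(np.arange(10, dtype=np.int32))  # what build_position_ids_dummy would return instead
```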
+ + current_pp_rank = dist.get_rank(self.parallel_context.pp_pg) + if current_pp_rank not in [ + self.input_pp_rank, + self.output_pp_rank, + ]: + assert all(len(example) == 0 for example in examples) + return { + "input_ids": TensorPointer(group_rank=self.input_pp_rank), + "input_mask": TensorPointer(group_rank=self.input_pp_rank), + "label_ids": TensorPointer(group_rank=self.output_pp_rank), + "label_mask": TensorPointer(group_rank=self.output_pp_rank), + } + + # TODO clean this, as we are flatting the batch there is no necessity for vstack but we need the batch dimension too + input_ids = np.vstack([examples[i]["input_ids"] for i in range(len(examples))]) # (b, s) + label_ids = np.vstack([examples[i]["label_ids"] for i in range(len(examples))]) # (b, s) + position_ids = np.vstack([examples[i]["position_ids"] for i in range(len(examples))]) # (b, s) + + result: Dict[str, Union[np.ndarray, TensorPointer]] = {} + + result["input_ids"] = TensorPointer(group_rank=self.input_pp_rank) + result["input_mask"] = TensorPointer(group_rank=self.input_pp_rank) + result["label_ids"] = TensorPointer(group_rank=self.output_pp_rank) + result["label_mask"] = TensorPointer(group_rank=self.output_pp_rank) + + # Process inputs + if current_pp_rank == self.input_pp_rank: + result["input_ids"] = input_ids + result["input_mask"] = np.ones((1, self.sequence_length), dtype=np.bool_) + result["position_ids"] = position_ids + + # Process labels: shift them to the left + if current_pp_rank == self.output_pp_rank: + result["label_ids"] = label_ids + result["label_mask"] = np.ones((1, self.sequence_length), dtype=np.bool_) + + # Cast np.array to torch.Tensor + result = {k: v if isinstance(v, TensorPointer) else torch.from_numpy(v) for k, v in result.items()} + return result diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py index 9d3285f6..f63237ad 100644 --- a/src/nanotron/data/dataloader_builder.py +++ b/src/nanotron/data/dataloader_builder.py @@ -1,6 +1,6 @@ import nanotron.distributed as dist from nanotron import logging -from nanotron.data.collator import NanosetDataCollatorForCLM +from nanotron.data.collator import NanoChatDataCollatorForSFT, NanosetDataCollatorForCLM from nanotron.dataloader import ( EmptyInfiniteDataset, get_dataloader_worker_init, @@ -62,3 +62,36 @@ def build_nanoset_dataloader( pin_memory=dataloader_pin_memory, worker_init_fn=get_dataloader_worker_init(dp_rank=dp_rank), ) + + +def build_chat_dataloader( + dataset, + sequence_length: int, + parallel_context: ParallelContext, + input_pp_rank: int, + output_pp_rank: int, + dataloader_pin_memory: bool = True, +) -> DataLoader: + + # Case of ranks not requiring data. 
We give them a dummy dataset, then the collator will do his job + if dist.get_rank(parallel_context.pp_pg) not in [input_pp_rank, output_pp_rank]: + dataset_length = 1_000_000 # len(dataset) TODO find a more elegant way to specify this dummy dataset + dataset = EmptyInfiniteDataset(length=dataset_length) + + data_collator = NanoChatDataCollatorForSFT( + sequence_length=sequence_length, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + parallel_context=parallel_context, + ) + + dp_rank = parallel_context.dp_pg.rank() + + return DataLoader( + dataset, + batch_size=1, + collate_fn=data_collator, + num_workers=0, + pin_memory=dataloader_pin_memory, + worker_init_fn=get_dataloader_worker_init(dp_rank=dp_rank), + ) diff --git a/src/nanotron/models/llama_sft.py b/src/nanotron/models/llama_sft.py new file mode 100644 index 00000000..a7ccb9d2 --- /dev/null +++ b/src/nanotron/models/llama_sft.py @@ -0,0 +1,888 @@ +# coding=utf-8 +# Copyright 2018 HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch LLaMa model.""" + +from typing import Dict, Optional, Union + +import torch +from flash_attn import flash_attn_varlen_func +from torch import nn + +from nanotron import distributed as dist +from nanotron import logging +from nanotron.config import Config, LlamaConfig, ParallelismArgs +from nanotron.config.models_config import RandomInit, SpectralMupInit +from nanotron.generation.generate_store import AttachableStore +from nanotron.logging import log_rank +from nanotron.models import NanotronModel +from nanotron.nn.activations import ACT2FN +from nanotron.nn.layer_norm import TritonRMSNorm +from nanotron.parallel import ParallelContext +from nanotron.parallel.parameters import NanotronParameter +from nanotron.parallel.pipeline_parallel.block import PipelineBlock, TensorPointer +from nanotron.parallel.pipeline_parallel.p2p import P2P +from nanotron.parallel.tensor_parallel.functional import sharded_cross_entropy +from nanotron.parallel.tensor_parallel.nn import ( + TensorParallelColumnLinear, + TensorParallelEmbedding, + TensorParallelLinearMode, + TensorParallelRowLinear, +) +from nanotron.random import RandomStates +from nanotron.scaling.parametrization import SpectralMupParametrizator, StandardParametrizator +from nanotron.utils import checkpoint_method + +logger = logging.get_logger(__name__) + + +####### +# NOTE(tj.solergibert) Copied from https://github.com/huggingface/transformers/blob/81233c069c166af033794134bd8888783ac49ebe/src/transformers/modeling_rope_utils.py#L29 +def _compute_default_rope_parameters( + config: LlamaConfig, +) -> torch.Tensor: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config (LlamaConfig): + The model configuration. 
+ Returns: + inv_freq (torch.Tensor) + Contains the inverse frequencies for the RoPE embeddings + """ + + base = config.rope_theta # NOTE(tj.solergibert) 500000.0 + partial_rotary_factor = ( + config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + ) # NOTE(tj.solergibert) 1 + dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) # NOTE(tj.solergibert) 128 + + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) + return inv_freq + + +# NOTE(tj.solergibert) Copied from https://github.com/huggingface/transformers/blob/5f841c74b62754f186a8c06a684d491524b7bc03/src/transformers/models/llama/modeling_llama.py#L81 +# NOTE(tj.solergibert) FlashAttention RoPEs are faster (triton), but currently they don't support position_ids +# NOTE(tj.solergibert) This function is just called once per batch to compute the position_embeddings, the expensive operation +# is def apply_rotary_pos_emb +class LlamaRotaryEmbedding(nn.Module): + def __init__( + self, + config: LlamaConfig, + ): + super().__init__() + self.config = config + + inv_freq = _compute_default_rope_parameters(self.config) # NOTE(tj.solergibert) shape: 64 , 1.0 + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + def forward(self, x, position_ids): + # Core RoPE block + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# NOTE(tj.solergibert) FlashAttention RoPEs are faster (triton), but currently they don't support position_ids +def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (torch.Tensor): The query tensor. + k (torch.Tensor): The key tensor. + cos (torch.Tensor): The cosine part of the rotary embedding. + sin (torch.Tensor): The sine part of the rotary embedding. + unsqueeze_dim (int, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + tuple (torch.Tensor) comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos.unsqueeze(unsqueeze_dim) # NOTE(tj.solergibert) [1, 70, 128] --> [1, 1, 70, 128] + sin = sin.unsqueeze(unsqueeze_dim) # NOTE(tj.solergibert) + q_embed = (q * cos) + (rotate_half(q) * sin) # NOTE(tj.solergibert) [1, 32, 70, 128] + k_embed = (k * cos) + (rotate_half(k) * sin) # NOTE(tj.solergibert) [1, 8, 70, 128] + return q_embed, k_embed + + +def prepare_varlen_args(position_ids): + position_ids = position_ids.flatten() + indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32) + cu_seqlens = torch.cat( + ( + indices_q[position_ids == 0], + torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32), + ) + ) + + max_seqlen_in_batch = position_ids.max() + 1 + + return cu_seqlens, max_seqlen_in_batch + + +####### + + +class GLUActivation(nn.Module): + def __init__(self, act_fn_name: str): + super().__init__() + self.act = ACT2FN[act_fn_name] + + def forward(self, merged_states: torch.Tensor): + gate_states, up_states = torch.split(merged_states, merged_states.shape[-1] // 2, dim=-1) + return self.act(gate_states) * up_states + + +class MLP(nn.Module): + def __init__( + self, + config: LlamaConfig, + parallel_config: Optional[ParallelismArgs], + tp_pg: dist.ProcessGroup, + ): + super().__init__() + + # TODO @thomasw21: refactor so that we store that default in a single place. + tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE + tp_linear_async_communication = ( + parallel_config.tp_linear_async_communication if parallel_config is not None else False + ) + + gate_up_contiguous_chunks = ( + config.intermediate_size, # shape of gate_linear + config.intermediate_size, # shape of up_linear + ) + self.gate_up_proj = TensorParallelColumnLinear( + config.hidden_size, + 2 * config.intermediate_size, + pg=tp_pg, + mode=tp_mode, + bias=False, + async_communication=tp_linear_async_communication, + contiguous_chunks=gate_up_contiguous_chunks, + ) + self.down_proj = TensorParallelRowLinear( + config.intermediate_size, + config.hidden_size, + pg=tp_pg, + mode=tp_mode, + bias=False, + async_communication=tp_linear_async_communication and tp_mode is TensorParallelLinearMode.REDUCE_SCATTER, + ) + # TODO @nouamane: why can't we torch.jit.script GLUActivation? + self.split_silu_mul = GLUActivation(config.hidden_act) + + def forward(self, hidden_states): # [seq_length, batch_size, hidden_dim] + merged_states = self.gate_up_proj(hidden_states) + hidden_states = self.down_proj(self.split_silu_mul(merged_states)) + return hidden_states + + +class CoreAttention(nn.Module): + def __init__(self, config: LlamaConfig, parallel_config: Optional[ParallelismArgs], layer_idx: int): + super().__init__() + # TODO @thomasw21: GPT has a weird `d_kv` config which I'm guessing is essentically a `d_qkv` + assert ( + config.hidden_size % config.num_attention_heads == 0 + ), f"Hidden size {config.hidden_size} must be divisible by number of attention heads {config.num_attention_heads}." 
+        self.d_qk = config.hidden_size // config.num_attention_heads
+        self.d_v = config.hidden_size // config.num_attention_heads
+        self.is_using_mup = config.is_using_mup
+
+        self.checkpoint_attention = False  # Because flash_attn already does checkpointing
+
+    @checkpoint_method(attr_name="checkpoint_attention")
+    def forward(
+        self,
+        query_states: torch.Tensor,  # [batch_size, q_length, n_local_q_heads, inner_dim]
+        key_states: torch.Tensor,  # [batch_size, kv_length, n_local_kv_heads, inner_dim]
+        value_states: torch.Tensor,  # [batch_size, kv_length, n_local_kv_heads, inner_dim]
+    ):
+        from flash_attn.flash_attn_interface import flash_attn_func
+
+        # NOTE: this scale is for µTransfer,
+        # in SP, we use sqrt(1/d_h)
+        softmax_scale = 1 / query_states.shape[-1] if self.is_using_mup else None
+        # For now we are assuming that we use a causal mask. No magic here
+        causal = True
+        attn_output = flash_attn_func(
+            q=query_states,
+            k=key_states,
+            v=value_states,
+            dropout_p=0.0,
+            softmax_scale=softmax_scale,
+            causal=causal,
+            return_attn_probs=False,
+        )
+
+        return attn_output
+
+
+class CausalSelfAttention(nn.Module, AttachableStore):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        parallel_config: Optional[ParallelismArgs],
+        tp_pg: dist.ProcessGroup,
+        layer_idx: int,
+    ):
+
+        super().__init__()
+        # Tensor parallel considerations: We split tensors along head dimension
+        assert (
+            config.num_attention_heads % tp_pg.size() == 0
+        ), f"Number of attention heads ({config.num_attention_heads}) must be divisible by TP size ({tp_pg.size()})."
+        try:
+            assert (
+                config.num_key_value_heads % tp_pg.size() == 0
+            ), f"Number of key/value heads ({config.num_key_value_heads}) must be divisible by TP size ({tp_pg.size()})."
+        except AttributeError:
+            log_rank(
+                "WARNING: num_key_value_heads not defined, assuming it is equal to num_attention_heads",
+                logger=logger,
+                level=logging.WARNING,
+                rank=0,
+            )
+            # If num_key_value_heads is not defined, we assume that it is equal to num_attention_heads
+            config.num_key_value_heads = config.num_attention_heads
+        assert (
+            config.num_attention_heads % config.num_key_value_heads == 0
+        ), f"Number of attention heads ({config.num_attention_heads}) must be divisible by number of key/value heads ({config.num_key_value_heads})."
+        self.n_local_q_heads = config.num_attention_heads // tp_pg.size()
+        self.n_local_kv_heads = config.num_key_value_heads // tp_pg.size()
+        self.n_repeats = config.num_attention_heads // config.num_key_value_heads
+        self.is_gqa = config.num_attention_heads != config.num_key_value_heads  # Whether we are using GQA or not
+        self.d_qk = config.hidden_size // config.num_attention_heads
+        self.d_v = config.hidden_size // config.num_attention_heads
+        self.d_model = config.hidden_size
+        self.is_using_mup = config.is_using_mup
+
+        # TODO @thomasw21: refactor so that we store that default in a single place.
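# Illustrative sketch (numbers only, not part of the patch): how the head bookkeeping above
# works out for a Llama-3-8B-style config (32 attention heads, 8 KV heads, hidden_size 4096).
# The TP size of 2 is an assumption for illustration; the conversion notebook runs with TP=1.
num_attention_heads, num_key_value_heads, hidden_size, tp_size = 32, 8, 4096, 2

n_local_q_heads = num_attention_heads // tp_size        # 16 query heads per TP rank
n_local_kv_heads = num_key_value_heads // tp_size       # 4 KV heads per TP rank
n_repeats = num_attention_heads // num_key_value_heads  # 4 query heads share each KV head (GQA)
d_qk = hidden_size // num_attention_heads               # 128 dims per head
qkv_width = num_attention_heads * d_qk + 2 * num_key_value_heads * d_qk  # unsharded fused QKV width

assert (n_local_q_heads, n_local_kv_heads, n_repeats, d_qk) == (16, 4, 4, 128)
assert qkv_width == 6144  # matches the unsharded_out_features of qkv_proj printed in the notebook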
+        tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE
+        tp_linear_async_communication = (
+            parallel_config.tp_linear_async_communication if parallel_config is not None else False
+        )
+
+        # build the slice config for self.qkv for save/load
+        # shards are done within the contiguous chunk
+        qkv_contiguous_chunks = (
+            config.num_attention_heads * self.d_qk,  # shape of q
+            config.num_key_value_heads * self.d_qk,  # shape of k
+            config.num_key_value_heads * self.d_qk,  # shape of v
+        )
+        self.qkv_proj = TensorParallelColumnLinear(
+            self.d_model,
+            config.num_attention_heads * self.d_qk + 2 * config.num_key_value_heads * self.d_qk,
+            pg=tp_pg,
+            mode=tp_mode,
+            bias=False,
+            async_communication=tp_linear_async_communication,
+            contiguous_chunks=qkv_contiguous_chunks,
+        )
+
+        self.o_proj = TensorParallelRowLinear(
+            config.num_attention_heads * self.d_qk,
+            self.d_model,
+            pg=tp_pg,
+            mode=tp_mode,
+            bias=False,
+            async_communication=tp_linear_async_communication,
+        )
+
+        # TODO(tj.solergibert) Get rid of this block, for goodness' sake!!!
+        self.attention = CoreAttention(
+            config,
+            parallel_config=parallel_config,
+            layer_idx=layer_idx,
+        )
+
+    def forward(
+        self,
+        hidden_states,  # [seq_length, batch_size, hidden_size]
+        position_ids,  # [batch_size, seq_length]
+        cos,  # [batch_size, seq_length, hidden_size//num_attention_heads]
+        sin,  # [batch_size, seq_length, hidden_size//num_attention_heads]
+    ):
+        qkv_states = self.qkv_proj(
+            hidden_states
+        )  # [seq_length, batch_size, n_local_q_heads * d_qk + 2 * n_local_kv_heads * d_qk]
+        q_length, batch_size, _ = qkv_states.shape
+
+        if self.is_gqa:
+            query_states, key_states, value_states = torch.split(
+                qkv_states,
+                [
+                    self.n_local_q_heads * self.d_qk,
+                    self.n_local_kv_heads * self.d_qk,
+                    self.n_local_kv_heads * self.d_qk,
+                ],
+                dim=-1,
+            )
+
+            query_states = (
+                query_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_q_heads, self.d_qk)
+            )
+            key_states = (
+                key_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk)
+            )
+            value_states = (
+                value_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk)
+            )
+        else:
+            query_states, key_states, value_states = (
+                qkv_states.view(q_length, batch_size, 3, self.n_local_q_heads, self.d_qk)
+                .permute(2, 1, 0, 3, 4)
+                .contiguous()
+            )  # [3, batch_size, seq_length, n_local_q_heads, d_qk]
+
+        # Training case OLD
+        # Apply rotary embeddings to query/key states
+        # NOTE: The layout is different from models/llama.py which is [batch_size, num_heads, seq_length, d_qk]
+        # Here it is, [batch_size, seq_length, num_heads, d_qk]
+        # [2, batch_size, seq_length, num_heads, d_qk]
+        # key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0)
+        # [batch_size, seq_length, 2, num_heads, d_qk]
+        # key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous()
+        # query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states)
+        # [batch_size, seq_length, num_heads, d_qk]
+        # key_states, value_states = torch.split(key_value_states, 1, dim=2)
+
+        # TODO(tj.solergibert) Check whether this commented-out code is actually useful or not!
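# Illustrative sketch (toy shapes, not part of the patch, assuming apply_rotary_pos_emb from
# this file is in scope): the RoPE helper expects q/k in [batch, heads, seq, head_dim] layout,
# which is why the forward pass below transposes from [batch, seq, heads, head_dim] before
# applying it and transposes back afterwards. Random cos/sin stand in for the tables produced
# by LlamaRotaryEmbedding.
import torch

batch, seq, q_heads, kv_heads, d = 1, 5, 4, 2, 8
q = torch.randn(batch, seq, q_heads, d)
k = torch.randn(batch, seq, kv_heads, d)
cos, sin = torch.randn(batch, seq, d), torch.randn(batch, seq, d)

q_t, k_t = q.transpose(1, 2), k.transpose(1, 2)              # -> [batch, heads, seq, head_dim]
q_rot, k_rot = apply_rotary_pos_emb(q_t, k_t, cos, sin)      # cos/sin broadcast over the head axis
q_rot, k_rot = q_rot.transpose(1, 2), k_rot.transpose(1, 2)  # back to [batch, seq, heads, head_dim]
assert q_rot.shape == q.shape and k_rot.shape == k.shape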
+ # kv_length = key_states.shape[1] + # key_states = key_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_qk) + # value_states = value_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_v) + + # attention_output = self.attention( + # query_states=query_states, + # key_states=key_states, + # value_states=value_states, + # ) + + # TODO(tj.solergibert) Apply RoPE embeddings WITHOUT too many transpose... + query_states, key_states = query_states.transpose(1, 2), key_states.transpose(1, 2) + # Apply RoPE + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + query_states, key_states = query_states.transpose(1, 2), key_states.transpose(1, 2) + + # Prepare varlen args + cu_seqlens, max_seqlen_in_batch = prepare_varlen_args(position_ids) + print(cu_seqlens) + print(max_seqlen_in_batch) + query_states = query_states.view(-1, query_states.size(-2), query_states.size(-1)) + key_states = key_states.view(-1, key_states.size(-2), key_states.size(-1)) + value_states = value_states.view(-1, value_states.size(-2), value_states.size(-1)) + + attention_output = flash_attn_varlen_func( + query_states, # NOTE(tj.solergibert) Shape: [70, 32, 128] + key_states, # NOTE(tj.solergibert) Shape: [70, 8, 128] + value_states, # NOTE(tj.solergibert) Shape: [70, 8, 128] + cu_seqlens_q=cu_seqlens, # NOTE(tj.solergibert) Shape: Tensor, [14] + cu_seqlens_k=cu_seqlens, # NOTE(tj.solergibert) Shape: Tensor, [14] + max_seqlen_q=max_seqlen_in_batch, # NOTE(tj.solergibert) Shape: Tensor, [1] Just 1 element with the longer sequence in batch. In the HF Transformers dummy test is 7 + max_seqlen_k=max_seqlen_in_batch, # NOTE(tj.solergibert) Shape: Tensor, [1] Just 1 element with the longer sequence in batch. In the HF Transformers dummy test is 7 + causal=True, # NOTE(tj.solergibert) True + ) # NOTE(tj.solergibert) Returns out: (total, nheads, headdim). + + attention_output = ( + attention_output.contiguous() + .view(batch_size, q_length, self.n_local_q_heads * self.d_v) + .transpose(0, 1) # TODO(tj.solergibert) View is necessary, but contiguous? 
+ ) + output = self.o_proj(attention_output) + + return output + + +class LlamaDecoderLayer(nn.Module): + def __init__( + self, + config: LlamaConfig, + parallel_config: Optional[ParallelismArgs], + tp_pg: dist.ProcessGroup, + layer_idx: int, + ): + super().__init__() + self.input_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.attn = CausalSelfAttention( + config=config, + parallel_config=parallel_config, + tp_pg=tp_pg, + layer_idx=layer_idx, + ) + + self.post_attention_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.mlp = MLP(config=config, parallel_config=parallel_config, tp_pg=tp_pg) + + def forward( + self, + hidden_states: Union[torch.Tensor, TensorPointer], + position_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + cos: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + sin: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + hidden_states = self.attn(hidden_states=hidden_states, position_ids=position_ids, cos=cos, sin=sin) + hidden_states = hidden_states + residual + + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states=hidden_states) + hidden_states = hidden_states + residual + + return { + "hidden_states": hidden_states, + "position_ids": position_ids, + "cos": cos, + "sin": sin, + } + + +class Embedding(nn.Module, AttachableStore): + def __init__(self, tp_pg: dist.ProcessGroup, config: LlamaConfig, parallel_config: Optional[ParallelismArgs]): + super().__init__() + self.token_embedding = TensorParallelEmbedding( + num_embeddings=config.vocab_size, + embedding_dim=config.hidden_size, + padding_idx=config.pad_token_id, + pg=tp_pg, + mode=parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE, + ) + self.pg = tp_pg + + # NOTE(tj.solergibert) SFT + self.position_embedding = LlamaRotaryEmbedding(config=config) + + def forward(self, input_ids: torch.Tensor, position_ids: torch.Tensor): # [batch_size, seq_length] + # TODO(tj.solergibert) Delete this store stuff ################ + store = self.get_local_store() + if store is not None: + if "past_length" in store: + store["past_length"] + else: + torch.zeros(1, dtype=torch.long, device=input_ids.device).expand(input_ids.shape[0]) + + # cumsum_mask = input_mask.cumsum(-1, dtype=torch.long) + # Store new past_length in store + # store["past_length"] = past_length + cumsum_mask[:, -1] + ################################################################ + + # NOTE(tj.solergibert) We create the cos & sin and propagate them through the pipeline so we + # don't have to create the LlamaRotaryEmbedding layer in each and every decoder layer + # We will still send the position ids for the varlen, but we will try to delete it. 
Computing them from + # the position ids it's not very expensive AND we keep a tensor with constant shape + cos, sin = self.position_embedding( + input_ids, position_ids + ) # TODO(tj.solergibert) We just need from inputs_ids the device type + + # Format input in `[seq_length, batch_size]` to support high TP with low batch_size + input_ids = input_ids.transpose(0, 1) + input_embeds = self.token_embedding(input_ids) + return {"input_embeds": input_embeds, "position_ids": position_ids, "cos": cos, "sin": sin} + + +class LlamaModel(nn.Module): + """Build pipeline graph""" + + def __init__( + self, + config: LlamaConfig, + parallel_context: ParallelContext, + parallel_config: Optional[ParallelismArgs], + ): + super().__init__() + + # Declare all the nodes + self.p2p = P2P(parallel_context.pp_pg, device=torch.device("cuda")) + self.config = config + self.parallel_config = parallel_config + self.parallel_context = parallel_context + self.tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE + tp_linear_async_communication = ( + parallel_config.tp_linear_async_communication if parallel_config is not None else False + ) + + self.token_position_embeddings = PipelineBlock( + p2p=self.p2p, + module_builder=Embedding, + module_kwargs={ + "tp_pg": parallel_context.tp_pg, + "config": config, + "parallel_config": parallel_config, + }, + module_input_keys={"input_ids", "position_ids"}, + module_output_keys={"input_embeds", "position_ids", "cos", "sin"}, + ) + + self.decoder = nn.ModuleList( + [ + PipelineBlock( + p2p=self.p2p, + module_builder=LlamaDecoderLayer, + module_kwargs={ + "config": config, + "parallel_config": parallel_config, + "tp_pg": parallel_context.tp_pg, + "layer_idx": layer_idx, + }, + module_input_keys={"hidden_states", "position_ids", "cos", "sin"}, + module_output_keys={"hidden_states", "position_ids", "cos", "sin"}, + ) + for layer_idx in range(config.num_hidden_layers) + ] + ) + + self.final_layer_norm = PipelineBlock( + p2p=self.p2p, + module_builder=TritonRMSNorm, + module_kwargs={"hidden_size": config.hidden_size, "eps": config.rms_norm_eps}, + module_input_keys={"input"}, + module_output_keys={"hidden_states"}, + ) # TODO + + self.lm_head = PipelineBlock( + p2p=self.p2p, + # Understand that this means that we return sharded logits that are going to need to be gathered + module_builder=TensorParallelColumnLinear, + module_kwargs={ + "in_features": config.hidden_size, + "out_features": config.vocab_size, + "pg": parallel_context.tp_pg, + "bias": False, + # TODO @thomasw21: refactor so that we store that default in a single place. + "mode": self.tp_mode, + "async_communication": tp_linear_async_communication, + }, + module_input_keys={"x"}, + module_output_keys={"logits"}, + ) + + self.cast_to_fp32 = PipelineBlock( + p2p=self.p2p, + module_builder=lambda: lambda x: x.float(), + module_kwargs={}, + module_input_keys={"x"}, + module_output_keys={"output"}, + ) + + def forward( + self, + input_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + position_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + ): + return self.forward_with_hidden_states(input_ids=input_ids, position_ids=position_ids)[0] + + def forward_with_hidden_states( + self, + input_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + position_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + ): + # all tensors are optional as most ranks don't need anything from the dataloader. 
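# Illustrative sketch (toy sizes, not part of the patch): the Embedding block above builds the
# rotary tables once per batch from the position ids and the default RoPE parameters
# (rope_theta=500000.0, head dim 128 for this config), then ships cos/sin through the pipeline
# next to the hidden states. The einsum below is just a compact rewrite of the
# inv_freq @ position_ids product in LlamaRotaryEmbedding.forward.
import torch

head_dim, rope_theta = 128, 500000.0
inv_freq = 1.0 / (rope_theta ** (torch.arange(0, head_dim, 2, dtype=torch.int64).float() / head_dim))

position_ids = torch.arange(6).unsqueeze(0).float()        # [batch=1, seq_length=6]
freqs = torch.einsum("d,bs->bsd", inv_freq, position_ids)  # [1, 6, 64]
emb = torch.cat((freqs, freqs), dim=-1)                    # [1, 6, 128]
cos, sin = emb.cos(), emb.sin()
assert inv_freq.shape == (64,) and cos.shape == (1, 6, 128) and sin.shape == (1, 6, 128)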
+ + hidden_encoder_states = self.token_position_embeddings(input_ids=input_ids, position_ids=position_ids) + + # NOTE(tj.solergibert) Rename input_embeds --> hidden_states + hidden_encoder_states["hidden_states"] = hidden_encoder_states.pop("input_embeds") + + for encoder_block in self.decoder: + hidden_encoder_states = encoder_block(**hidden_encoder_states) + + hidden_states = self.final_layer_norm(input=hidden_encoder_states["hidden_states"])["hidden_states"] + + sharded_logits = self.lm_head(x=hidden_states)["logits"] + + fp32_sharded_logits = self.cast_to_fp32(x=sharded_logits)["output"] + + return fp32_sharded_logits, hidden_states + + def get_block_compute_costs(self): + """Computes the compute cost of each block in the model so that we can do a better job of load balancing.""" + model_config = self.config + d_ff = model_config.intermediate_size + d_qkv = model_config.hidden_size // model_config.num_attention_heads + block_compute_costs = { + # CausalSelfAttention (qkv proj + attn out) + MLP + LlamaDecoderLayer: 4 * model_config.num_attention_heads * d_qkv * model_config.hidden_size + + 3 * d_ff * model_config.hidden_size, + # This is the last lm_head + TensorParallelColumnLinear: model_config.vocab_size * model_config.hidden_size, + } + return block_compute_costs + + def get_flops_per_sec(self, iteration_time_in_sec, sequence_length, global_batch_size): + """Get flops per second for a given model""" + world_size = self.parallel_context.world_pg.size() + try: + num_key_values_heads = self.config.num_key_value_heads + except AttributeError: + num_key_values_heads = self.config.num_attention_heads + + model_flops, hardware_flops = get_flops( + num_layers=self.config.num_hidden_layers, + hidden_size=self.config.hidden_size, + num_heads=self.config.num_attention_heads, + num_key_value_heads=num_key_values_heads, + vocab_size=self.config.vocab_size, + ffn_hidden_size=self.config.intermediate_size, + seq_len=sequence_length, + batch_size=global_batch_size, + ) + + model_flops_per_s = model_flops / (iteration_time_in_sec * world_size * 1e12) + hardware_flops_per_s = hardware_flops / (iteration_time_in_sec * world_size * 1e12) + return model_flops_per_s, hardware_flops_per_s + + +@torch.jit.script +def masked_mean(loss, label_mask, dtype): + # type: (Tensor, Tensor, torch.dtype) -> Tensor + return (loss * label_mask).sum(dtype=dtype) / label_mask.sum() + + +class Loss(nn.Module): + def __init__(self, tp_pg: dist.ProcessGroup): + super().__init__() + self.tp_pg = tp_pg + + def forward( + self, + sharded_logits: torch.Tensor, # [seq_length, batch_size, logits] + label_ids: torch.Tensor, # [batch_size, seq_length] + label_mask: torch.Tensor, # [batch_size, seq_length] + ) -> Dict[str, torch.Tensor]: + # Megatron by defaults cast everything in fp32. `--f16-lm-cross-entropy` is an option you can use to keep current precision. + # https://github.com/NVIDIA/Megatron-LM/blob/f267e6186eae1d6e2055b412b00e2e545a8e896a/megatron/model/gpt_model.py#L38 + + loss = sharded_cross_entropy( + sharded_logits, label_ids.transpose(0, 1).contiguous(), group=self.tp_pg, dtype=torch.float + ).transpose(0, 1) + # TODO @thomasw21: It's unclear what kind of normalization we want to do. 
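# Illustrative sketch (toy values, not part of the patch): with the label_mask produced by the
# chat collator, only positions whose target is marked True (the assistant tokens) contribute
# to the averaged loss; the masked-out positions are ignored.
import torch

per_token_loss = torch.tensor([[2.0, 4.0, 6.0, 8.0]])
label_mask = torch.tensor([[False, False, True, True]])
loss = (per_token_loss * label_mask).sum(dtype=torch.float) / label_mask.sum()
assert loss.item() == 7.0  # (6 + 8) / 2, the two masked-out positions do not count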
+ loss = masked_mean(loss, label_mask, dtype=torch.float) + # I think indexing causes a sync we don't actually want + # loss = loss[label_mask].sum() + return {"loss": loss} + + +class LlamaForSFT(NanotronModel): + def __init__( + self, + config: LlamaConfig, + parallel_context: ParallelContext, + parallel_config: Optional[ParallelismArgs], + random_states: Optional[RandomStates] = None, + ): + super().__init__() + self.model = LlamaModel(config=config, parallel_context=parallel_context, parallel_config=parallel_config) + self.loss = PipelineBlock( + p2p=self.model.p2p, + module_builder=Loss, + module_kwargs={"tp_pg": parallel_context.tp_pg}, + module_input_keys={ + "sharded_logits", + "label_ids", + "label_mask", + }, + module_output_keys={"loss"}, + ) + self.parallel_context = parallel_context + self.config = config + self.parallel_config = parallel_config + + def forward( + self, + input_ids: Union[torch.Tensor, TensorPointer], + position_ids: Union[torch.Tensor, TensorPointer], # [batch_size, seq_length] + label_ids: Union[torch.Tensor, TensorPointer], + label_mask: Union[torch.Tensor, TensorPointer], + ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + sharded_logits = self.model( + input_ids=input_ids, + position_ids=position_ids, + ) + loss = self.loss( + sharded_logits=sharded_logits, + label_ids=label_ids, + label_mask=label_mask, + )["loss"] + return {"loss": loss} + + @torch.no_grad() + def init_model_randomly(self, config: Config): + """Initialize model parameters randomly. + Note: + Layernorm weight all 0 or 1 depending on `apply_layernorm_1p` + """ + init_method = config.model.init_method + if isinstance(init_method, RandomInit): + parametrizator_cls = StandardParametrizator + elif isinstance(init_method, SpectralMupInit): + parametrizator_cls = SpectralMupParametrizator + else: + raise ValueError(f"Unknown init method {init_method}") + + parametrizator = parametrizator_cls(config=config.model) + + log_rank( + f"Parametrizing model parameters using {parametrizator.__class__.__name__}", + logger=logger, + level=logging.INFO, + rank=0, + ) + + model = self + initialized_parameters = set() + # Handle tensor parallelism + module_id_to_prefix = {id(module): f"{module_name}." 
for module_name, module in model.named_modules()} + # Fix the root_model + module_id_to_prefix[id(model)] = "" + + for param_name, param in model.named_parameters(): + assert isinstance(param, NanotronParameter) + + module_name, param_name = param_name.rsplit(".", 1) + + if param.is_tied: + tied_info = param.get_tied_info() + full_param_name = tied_info.get_full_name_from_module_id_to_prefix( + module_id_to_prefix=module_id_to_prefix + ) + else: + full_param_name = f"{module_name}.{param_name}" + + if full_param_name in initialized_parameters: + # Already initialized + continue + + module = model.get_submodule(module_name) + parametrizator.parametrize(param_name, module) + + assert full_param_name not in initialized_parameters + initialized_parameters.add(full_param_name) + + assert initialized_parameters == { + param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix) + if param.is_tied + else name + for name, param in model.named_parameters() + }, f"Somehow the initialized set of parameters don't match:\n - Expected: { {name for name, _ in model.named_parameters()} }\n - Got: {initialized_parameters}" + + def get_embeddings_lm_head_tied_names(self): + """Get the names of the tied embeddings and lm_head weights""" + if self.config.tie_word_embeddings is True: + return ["model.token_position_embeddings.pp_block.token_embedding.weight", "model.lm_head.pp_block.weight"] + else: + return [] + + def get_block_compute_costs(self): + """Computes the compute cost of each block in the model so that we can do a better job of load balancing.""" + return self.model.get_block_compute_costs() + + def get_flops_per_sec(self, iteration_time_in_sec, sequence_length, global_batch_size): + """Get flops per second for a given model""" + return self.model.get_flops_per_sec(iteration_time_in_sec, sequence_length, global_batch_size) + + +def get_flops( + num_layers, + hidden_size, + num_heads, + num_key_value_heads, + vocab_size, + seq_len, + ffn_hidden_size, + batch_size=1, +): + """Counts flops in an decoder-only model + Args: + num_layers: number of decoder layers + hidden_size: hidden size of the model + num_heads: number of heads in the model + num_key_value_heads: number of key/value heads in the model + ffn_hidden_size: hidden size of the FFN + vocab_size: size of the vocabulary + seq_len: sequence length of the decoder + batch_size: batch size + Returns: + model_flops: flops in the model (should be independent of the hardware and model implementation) + hardware_flops: flops in the hardware (actual flops performed on the hardware). 
Check 6.3 in https://arxiv.org/pdf/2205.05198.pdf + """ + if num_key_value_heads is None: + num_key_value_heads = num_heads + hidden_size_per_head = hidden_size // num_heads + # In the following we mark the reduced dimension with parentheses + # decoder + # self attention + ## qkv projection + decoder_qkv_proj_flops_fwd = ( + 2 * num_layers * batch_size * seq_len * (hidden_size) * num_heads * hidden_size_per_head + + 2 * num_layers * batch_size * seq_len * (hidden_size) * 2 * num_key_value_heads * hidden_size_per_head + ) + ## qk logits + decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (hidden_size_per_head) * seq_len + ## v logits + decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (seq_len) * hidden_size_per_head + ## attn out + decoder_attn_out_flops_fwd = ( + 2 * num_layers * batch_size * num_heads * seq_len * (hidden_size_per_head) * hidden_size + ) + # FF + ## 1st layer + decoder_ffn_1_flops_fwd = 4 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size + ## 2nd layer + decoder_ffn_2_flops_fwd = 2 * num_layers * batch_size * seq_len * (ffn_hidden_size) * hidden_size + + decoder_flops_fwd = ( + decoder_qkv_proj_flops_fwd + + decoder_qk_logits_flops_fwd + + decoder_v_logits_flops_fwd + + decoder_attn_out_flops_fwd + + decoder_ffn_1_flops_fwd + + decoder_ffn_2_flops_fwd + ) + + # lm head + lm_head_flops_fwd = 2 * batch_size * seq_len * (hidden_size) * vocab_size + + # the bwd pass requires double the flops in case of matmuls to calculate the gradients with respect to + # both input and weight tensors + model_flops = 3 * (decoder_flops_fwd + lm_head_flops_fwd) # 1 for fwd + 2 for bwd + + hardware_flops = model_flops # TODO: This is a placeholder for now + + return model_flops, hardware_flops diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index b6752f38..9984b881 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -56,7 +56,7 @@ ) from nanotron.models import NanotronModel, build_model from nanotron.models.base import check_model_has_grad -from nanotron.models.llama import LlamaForTraining, RotaryEmbedding +from nanotron.models.llama import LlamaForTraining from nanotron.models.starcoder2 import Starcoder2ForTraining from nanotron.optim.clip_grads import clip_grad_norm from nanotron.parallel import ParallelContext @@ -750,11 +750,12 @@ def _init_model( model_builder=model_builder, ) + # TODO(tj.solergibert) Fix this RoPE init only used with LlamaModel for generation? 
# Initialize rotary embeddings - for module in model.modules(): - if not isinstance(module, RotaryEmbedding): - continue - module.init_rotary_embeddings() + # for module in model.modules(): + # if not isinstance(module, RotaryEmbedding): + # continue + # module.init_rotary_embeddings() # Mark some parameters as tied self._mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config) From 03f4308b91e01d18fed4f133e475c83dca4cae6d Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 29 Jul 2024 08:38:49 +0000 Subject: [PATCH 2/9] This mess produces sames generations as hf --- convert_hf_nanotron.ipynb | 764 +++++++++++++++++++++++++----- src/nanotron/data/chat_dataset.py | 16 +- src/nanotron/data/collator.py | 21 +- src/nanotron/models/llama_sft.py | 106 ++--- 4 files changed, 690 insertions(+), 217 deletions(-) diff --git a/convert_hf_nanotron.ipynb b/convert_hf_nanotron.ipynb index 943b1af9..9bc573c3 100644 --- a/convert_hf_nanotron.ipynb +++ b/convert_hf_nanotron.ipynb @@ -24,6 +24,15 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, + "outputs": [], + "source": [ + "PATH_TO_LLAMA = \"/mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -32,21 +41,21 @@ "/home/solergib/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", - "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 13.70it/s]\n" + "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 13.15it/s]\n" ] } ], "source": [ "from transformers import AutoModelForCausalLM\n", - "PATH_TO_LLAMA = \"/mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct\"\n", "hf_model = AutoModelForCausalLM.from_pretrained(PATH_TO_LLAMA, torch_dtype=dtype, attn_implementation=\"flash_attention_2\").to(device)\n", "# print(hf_model)\n", - "# print(hf_model.config)" + "# print(hf_model.config)\n", + "#print(hf_model.model.rotary_emb.ori_inv_freq.dtype)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -93,7 +102,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -128,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -160,9 +169,17 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.float32\n" + ] + } + ], "source": [ "from nanotron.models.llama_sft import LlamaForSFT\n", "from nanotron.models import build_model\n", @@ -183,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -194,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -203,7 +220,7 @@ "ShardedInfo(global_ranks=(0,), local_global_slices_pairs=(SlicesPair(local_slices=(slice(None, None, None), slice(None, None, None)), 
global_slices=(slice(0, 128256, None), slice(None, None, None))),), unsharded_shape=(128256, 4096))" ] }, - "execution_count": 8, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -214,7 +231,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -223,7 +240,7 @@ "False" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -234,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -303,10 +320,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ + "\"\"\"\n", + "import importlib\n", + "import nanotron\n", + "importlib.reload(nanotron.data.chat_dataset)\n", + "importlib.reload(nanotron.data.collator)\n", + "\"\"\"\n", + "\n", "from nanotron.data.chat_dataset import ChatDataset\n", "from nanotron.data.dataloader_builder import build_chat_dataloader\n", "\n", @@ -334,203 +358,687 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': tensor([[128000, 128006, 26380, ..., 16686, 13, 128009]],\n", + " dtype=torch.int32),\n", + " 'position_ids': tensor([[ 0, 1, 2, ..., 576, 577, 578]], dtype=torch.int32),\n", + " 'label_ids': tensor([[128006, 26380, 128007, ..., 13, 128009, 128001]],\n", + " dtype=torch.int32),\n", + " 'label_mask': tensor([[False, False, False, ..., True, True, True]])}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batch = next(iter(train_dataloader))\n", + "batch" + ] + }, + { + "cell_type": "code", + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "batch = next(iter(train_dataloader))" + "assert batch[\"input_ids\"].shape == batch[\"label_ids\"].shape \n", + "assert batch[\"input_ids\"].shape == batch[\"position_ids\"].shape\n", + "assert batch[\"input_ids\"].shape == batch[\"label_mask\"].shape" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "tensor([[128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009,\n", - " 
128009, 128009, 128009, 128009, 128009, 128009]], dtype=torch.int32)" + "LlamaForSFT(\n", + " (model): LlamaModel(\n", + " (token_position_embeddings): PipelineBlock(\n", + " pp_rank=0\n", + " (pp_block): Embedding(\n", + " (token_embedding): TensorParallelEmbedding(tp_rank=0, 128256, 4096, unsharded_num_embeddings=128256)\n", + " (position_embedding): LlamaRotaryEmbedding()\n", + " )\n", + " )\n", + " (decoder): ModuleList(\n", + " (0-31): 32 x PipelineBlock(\n", + " pp_rank=0\n", + " (pp_block): LlamaDecoderLayer(\n", + " (input_layernorm): TritonRMSNorm()\n", + " (attn): CausalSelfAttention(\n", + " (qkv_proj): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=6144, bias=False, unsharded_out_features=6144)\n", + " (o_proj): TensorParallelRowLinear(tp_rank=0, in_features=4096, out_features=4096, bias=False, unsharded_in_features=4096)\n", + " )\n", + " (post_attention_layernorm): TritonRMSNorm()\n", + " (mlp): MLP(\n", + " (gate_up_proj): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=28672, bias=False, unsharded_out_features=28672)\n", + " (down_proj): TensorParallelRowLinear(tp_rank=0, in_features=14336, out_features=4096, bias=False, unsharded_in_features=14336)\n", + " (split_silu_mul): GLUActivation(\n", + " (act): SiLUActivation()\n", + " )\n", + " )\n", + " )\n", + " )\n", + " )\n", + " (final_layer_norm): PipelineBlock(\n", + " pp_rank=0\n", + " (pp_block): TritonRMSNorm()\n", + " )\n", + " (lm_head): PipelineBlock(\n", + " pp_rank=0\n", + " (pp_block): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=128256, bias=False, unsharded_out_features=128256)\n", + " )\n", + " (cast_to_fp32): PipelineBlock(pp_rank=0)\n", + " )\n", + " (loss): PipelineBlock(\n", + " pp_rank=0\n", + " (pp_block): Loss()\n", + " )\n", + ")" ] }, - "execution_count": 31, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "batch[\"input_ids\"][:, -150:]" + "# TODO(tj.solergibert) Comparar LlamaModel vs LlamaModel, nada de causal ni SFT!\n", + "# TODO(tj.solergibert) Vale, ya lo estabamos haciendo.\n", + "# TODO(tj.solergibert) Quedaria revisar lo de la LOSS, mierda. 
Tendremos que hacer una reduccion y usar la de pytorch\n", + "# TODO(tj.solergibert) Para asegurarnos que todo bien Y LUEGO YA SI ESO LO DE LA MASK.\n", + "hf_model.eval()\n", + "nanotron_model.eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1 a 1" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "input_ids = batch[\"input_ids\"].cuda()\n", + "position_ids = batch[\"position_ids\"].cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "n_embedd = nanotron_model.model.token_position_embeddings(input_ids=input_ids, position_ids=position_ids)\n", + "n_embedd[\"hidden_states\"] = n_embedd.pop(\"input_embeds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "hf_embedd = hf_model.model.embed_tokens(input_ids)\n", + "hf_position_embeddings = hf_model.model.rotary_emb(hf_embedd, position_ids)" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "assert_close(n_embedd[\"hidden_states\"].transpose(0,1), hf_embedd) # TODO(tj.solergibert) Embeddings now are equal!\n", + "assert_close(n_embedd[\"cos\"], hf_position_embeddings[0])\n", + "assert_close(n_embedd[\"sin\"], hf_position_embeddings[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n" + ] + } + ], + "source": [ + "n_hidden_encoder_states = nanotron_model.model.decoder[0](**n_embedd)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "tensor([[128000, 128006, 26380, ..., 13, 128009, 128001]],\n", - " dtype=torch.int32)" + "{'hidden_states': tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005]],\n", + " \n", + " [[ 0.0065, 0.0144, 0.0079, ..., -0.0157, -0.0422, -0.0073]],\n", + " \n", + " [[-0.0117, -0.0225, 0.0166, ..., -0.0114, -0.0019, 0.0105]],\n", + " \n", + " ...,\n", + " \n", + " [[ 0.0205, 0.0003, -0.0043, ..., -0.0337, 0.0027, -0.0114]],\n", + " \n", + " [[ 0.0017, -0.0008, 0.0084, ..., 0.0054, 0.0016, 0.0060]],\n", + " \n", + " [[-0.0025, -0.0031, -0.0141, ..., -0.0088, 0.0073, 0.0090]]],\n", + " device='cuda:0', dtype=torch.bfloat16, grad_fn=),\n", + " 'position_ids': tensor([[ 0, 1, 2, ..., 576, 577, 578]], device='cuda:0',\n", + " dtype=torch.int32),\n", + " 'cos': tensor([[[ 1.0000, 1.0000, 1.0000, ..., 1.0000, 1.0000, 1.0000],\n", + " [ 0.5391, 0.6875, 0.7891, ..., 1.0000, 1.0000, 1.0000],\n", + " [-0.4160, -0.0583, 0.2412, ..., 1.0000, 1.0000, 1.0000],\n", + " ...,\n", + " [-0.4629, -0.4336, 0.5078, ..., 1.0000, 1.0000, 1.0000],\n", + " [ 0.4941, 0.3574, 0.9297, ..., 1.0000, 1.0000, 1.0000],\n", + " [ 1.0000, 0.9258, 0.9609, ..., 1.0000, 1.0000, 1.0000]]],\n", + " device='cuda:0', dtype=torch.bfloat16),\n", + " 'sin': tensor([[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00,\n", + " 0.0000e+00, 0.0000e+00],\n", + " [ 8.3984e-01, 7.2656e-01, 6.1719e-01, ..., 3.6955e-06,\n", + " 3.0100e-06, 2.4587e-06],\n", + " [ 9.1016e-01, 1.0000e+00, 9.6875e-01, ..., 7.3910e-06,\n", + " 6.0201e-06, 4.9174e-06],\n", + " ...,\n", + " [-8.8672e-01, -9.0234e-01, -8.6328e-01, ..., 2.1362e-03,\n", + " 
1.7395e-03, 1.4114e-03],\n", + " [-8.6719e-01, -9.3359e-01, -3.6719e-01, ..., 2.1362e-03,\n", + " 1.7395e-03, 1.4191e-03],\n", + " [-5.2979e-02, -3.8086e-01, 2.8320e-01, ..., 2.1362e-03,\n", + " 1.7395e-03, 1.4191e-03]]], device='cuda:0', dtype=torch.bfloat16)}" ] }, - "execution_count": 32, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "batch[\"input_ids\"][:, :-150]" + "n_hidden_encoder_states" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n" + ] + } + ], + "source": [ + "hf_hidden = hf_model.model.layers[0](hf_embedd, position_ids=position_ids, position_embeddings=hf_position_embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "LlamaForCausalLM(\n", - " (model): LlamaModel(\n", - " (embed_tokens): Embedding(128256, 4096)\n", - " (layers): ModuleList(\n", - " (0-31): 32 x LlamaDecoderLayer(\n", - " (self_attn): LlamaFlashAttention2(\n", - " (q_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", - " (k_proj): Linear(in_features=4096, out_features=1024, bias=False)\n", - " (v_proj): Linear(in_features=4096, out_features=1024, bias=False)\n", - " (o_proj): Linear(in_features=4096, out_features=4096, bias=False)\n", - " (rotary_emb): LlamaRotaryEmbedding()\n", - " )\n", - " (mlp): LlamaMLP(\n", - " (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)\n", - " (up_proj): Linear(in_features=4096, out_features=14336, bias=False)\n", - " (down_proj): Linear(in_features=14336, out_features=4096, bias=False)\n", - " (act_fn): SiLU()\n", - " )\n", - " (input_layernorm): LlamaRMSNorm()\n", - " (post_attention_layernorm): LlamaRMSNorm()\n", - " )\n", - " )\n", - " (norm): LlamaRMSNorm()\n", - " (rotary_emb): LlamaRotaryEmbedding()\n", - " )\n", - " (lm_head): Linear(in_features=4096, out_features=128256, bias=False)\n", - ")" + "(tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005],\n", + " [ 0.0064, 0.0146, 0.0078, ..., -0.0157, -0.0425, -0.0073],\n", + " [-0.0117, -0.0225, 0.0167, ..., -0.0115, -0.0018, 0.0106],\n", + " ...,\n", + " [ 0.0205, 0.0004, -0.0043, ..., -0.0334, 0.0027, -0.0114],\n", + " [ 0.0017, -0.0008, 0.0084, ..., 0.0054, 0.0016, 0.0061],\n", + " [-0.0025, -0.0032, -0.0141, ..., -0.0087, 0.0073, 0.0090]]],\n", + " device='cuda:0', dtype=torch.bfloat16, grad_fn=),)" ] }, - "execution_count": 14, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "nanotron_model.eval()\n", - "hf_model.eval()" + "hf_hidden" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 40, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "AssertionError", + "evalue": "Tensor-likes are not close!\n\nMismatched elements: 1151415 / 7770112 (14.8%)\nGreatest absolute difference: 0.001953125 at index (0, 442, 3824) (up to 1e-05 allowed)\nGreatest relative difference: inf at index (0, 2, 2232) (up to 0.016 allowed)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[40], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_hidden_encoder_states\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhidden_states\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhf_hidden\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", + "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 1151415 / 7770112 (14.8%)\nGreatest absolute difference: 0.001953125 at index (0, 442, 3824) (up to 1e-05 allowed)\nGreatest relative difference: inf at index (0, 2, 2232) (up to 0.016 allowed)" + ] + } + ], "source": [ - "with torch.no_grad():\n", - " output_nanotron = nanotron_model.model(input_ids=batch[\"input_ids\"][:, :-150].cuda(), position_ids = batch[\"position_ids\"][:, :-150].cuda())" + "assert_close(n_hidden_encoder_states[\"hidden_states\"].transpose(0,1), hf_hidden[0])" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005],\n", + " [ 0.0060, 0.0125, 0.0074, ..., -0.0181, -0.0356, -0.0070],\n", + " [-0.0164, -0.0225, 0.0219, ..., -0.0098, -0.0084, 0.0156],\n", + " ...,\n", + " [ 0.0121, 0.0106, -0.0149, ..., -0.0229, -0.0056, -0.0021],\n", + " [ 0.0065, 0.0256, -0.0107, ..., -0.0027, -0.0085, 0.0192],\n", + " [ 0.0025, 0.0199, -0.0267, ..., -0.0056, -0.0045, 0.0182]]],\n", + " device='cuda:0', dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_hidden_encoder_states[\"hidden_states\"].transpose(0,1)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005],\n", + " [ 0.0064, 0.0146, 0.0078, ..., -0.0157, -0.0425, -0.0073],\n", + " [-0.0117, -0.0225, 0.0167, ..., -0.0115, -0.0018, 0.0106],\n", + " ...,\n", + " [ 0.0205, 0.0004, -0.0043, ..., -0.0334, 0.0027, -0.0114],\n", + " [ 0.0017, -0.0008, 0.0084, ..., 0.0054, 0.0016, 0.0061],\n", + " [-0.0025, -0.0032, -0.0141, ..., -0.0087, 0.0073, 0.0090]]],\n", + " device='cuda:0', dtype=torch.bfloat16, grad_fn=)" + ] + }, + "execution_count": 60, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "hf_hidden[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n", - "PEPEPEPEPE\n" + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " 
dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n" ] } ], "source": [ - "with torch.no_grad():\n", - " output_hf = hf_model(input_ids=batch[\"input_ids\"][:, :-150].cuda(), position_ids = batch[\"position_ids\"][:, :-150].cuda())" + "with torch.inference_mode():\n", + " output_nanotron = nanotron_model.model(input_ids=batch[\"input_ids\"].cuda(), position_ids = batch[\"position_ids\"].cuda())" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. 
Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " 
dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n", + "tensor(579, device='cuda:0', dtype=torch.int32)\n", + "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", + " dtype=torch.int32)\n" + ] + } + ], + "source": [ + "with torch.inference_mode():\n", + " output_hf = hf_model(input_ids=batch[\"input_ids\"].cuda(), position_ids = batch[\"position_ids\"].cuda())" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 4.9688, 6.1562, 10.8750, ..., -3.6406, -3.6406, -3.6406]],\n", + " device='cuda:0')" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_hf.logits[:,0,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[ 4.9375, 6.0938, 10.7500, ..., -3.6719, -3.6719, -3.6719]],\n", + " device='cuda:0')" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_nanotron.transpose(0,1)[:,0,:]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "metadata": {}, "outputs": [ { "ename": "AssertionError", - "evalue": "Tensor-likes are not close!\n\nMismatched elements: 243083431 / 243429888 (99.9%)\nGreatest absolute difference: 46.65625 at index (0, 1125, 22) (up to 1e-05 allowed)\nGreatest relative difference: 74448896.0 at index (0, 715, 31230) (up to 1.3e-06 allowed)", + "evalue": "Tensor-likes are not close!\n\nMismatched elements: 1143 / 128256 (0.9%)\nGreatest absolute difference: 0.5859375 at index (0, 12592) (up to 0.1 allowed)\nGreatest relative difference: 279.8438720703125 at index (0, 40526) (up to 0.1 allowed)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[38], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtesting\u001b[39;00m 
\u001b[38;5;28;01mimport\u001b[39;00m assert_close\n\u001b[0;32m----> 3\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[45], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtesting\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m assert_close\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# TODO(tj.solergibert) Ojo este test es solo de la position 0 jajajjajajajajajajaj\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43matol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", - "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 243083431 / 243429888 (99.9%)\nGreatest absolute difference: 46.65625 at index (0, 1125, 22) (up to 1e-05 allowed)\nGreatest relative difference: 74448896.0 at index (0, 715, 31230) (up to 1.3e-06 allowed)" + "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 1143 / 128256 (0.9%)\nGreatest absolute difference: 0.5859375 at index (0, 12592) (up to 0.1 allowed)\nGreatest relative difference: 279.8438720703125 at index (0, 40526) (up to 0.1 allowed)" ] } ], "source": [ "from torch.testing import assert_close\n", "\n", + "# TODO(tj.solergibert) Ojo este 
test es solo de la position 0 jajajjajajajajajajaj\n", + "\n", + "assert_close(output_hf.logits[:,0,:], output_nanotron.transpose(0,1)[:,0,:], rtol=1e-1, atol=1e-1)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "ename": "AssertionError", + "evalue": "Tensor-likes are not close!\n\nMismatched elements: 217458927 / 243301632 (89.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 1e-05 allowed)\nGreatest relative difference: 1744897.0 at index (0, 1435, 64528) (up to 1.3e-06 allowed)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[46], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", + "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 217458927 / 243301632 (89.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 1e-05 allowed)\nGreatest relative difference: 1744897.0 at index (0, 1435, 64528) (up to 1.3e-06 allowed)" + ] + } + ], + "source": [ "assert_close(output_hf.logits, output_nanotron.transpose(0,1))" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 47, "metadata": {}, "outputs": [ { @@ -562,23 +1070,23 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 48, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[Nanotron Model] Next token: 220, probability: 0.0804644376039505\n", - "[Nanotron Model] Next token: 994, probability: 0.029601214453577995\n", - "[Nanotron Model] Next token: 3639, probability: 0.02612297795712948\n", - "[Nanotron Model] Next token: 656, probability: 0.024540266022086143\n", - "[Nanotron Model] Next token: 279, probability: 0.024540266022086143\n", - "[Nanotron Model] Next token: 3277, probability: 0.021656708791851997\n", - "[Nanotron Model] Next token: 264, probability: 0.013982621021568775\n", - "[Nanotron Model] Next token: 1148, probability: 0.01022990420460701\n", - "[Nanotron Model] Next token: 507, probability: 0.01022990420460701\n", 
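Note on the two assert_close cells above: an element-wise comparison of the full logits is expected to fail at tight tolerances, since the Nanotron port fuses the QKV and gate/up projections and runs the whole forward in bf16, so the meaningful check is whether both models rank the same next tokens. A minimal, self-contained sketch of that kind of top-k agreement check (illustrative names, not the notebook's exact cells):

# Sketch: compare top-k next-token rankings instead of raw logits.
# Tensor names and shapes are illustrative.
import torch

def topk_matches(logits_a: torch.Tensor, logits_b: torch.Tensor, positions, k: int = 3) -> bool:
    """logits_* have shape [batch, seq_len, vocab]; returns True if the k most
    probable tokens agree at every sampled position."""
    for pos in positions:
        top_a = torch.topk(torch.softmax(logits_a[0, pos, :], dim=-1), k).indices
        top_b = torch.topk(torch.softmax(logits_b[0, pos, :], dim=-1), k).indices
        if not torch.equal(top_a, top_b):
            return False
    return True

# Example with identical (random) logits, so the check trivially passes:
fake_logits = torch.randn(1, 16, 128)
assert topk_matches(fake_logits, fake_logits.clone(), positions=[0, 5, 10], k=3)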
- "[Nanotron Model] Next token: 323, probability: 0.01022990420460701\n" + "[Nanotron Model] Next token: 11415, probability: 0.10305546224117279\n", + "[Nanotron Model] Next token: 1523, probability: 0.048679955303668976\n", + "[Nanotron Model] Next token: 47032, probability: 0.04295990616083145\n", + "[Nanotron Model] Next token: 10477, probability: 0.04035709798336029\n", + "[Nanotron Model] Next token: 3493, probability: 0.04035709798336029\n", + "[Nanotron Model] Next token: 72514, probability: 0.03791198879480362\n", + "[Nanotron Model] Next token: 16805, probability: 0.031430136412382126\n", + "[Nanotron Model] Next token: 10552, probability: 0.027737000957131386\n", + "[Nanotron Model] Next token: 7664, probability: 0.02299478091299534\n", + "[Nanotron Model] Next token: 3041, probability: 0.017908351495862007\n" ] } ], diff --git a/src/nanotron/data/chat_dataset.py b/src/nanotron/data/chat_dataset.py index ac46ba42..240b9e8e 100644 --- a/src/nanotron/data/chat_dataset.py +++ b/src/nanotron/data/chat_dataset.py @@ -116,23 +116,23 @@ def __iter__(self): buffer_lengths = [len(tokens)] # Pad to max_buffer_token_len. Pad token added in ChatTokenizer init if necessary - sample_tokens.extend( - [self.chat_tokenizer.tokenizer.pad_token_id] * (max_buffer_token_len - len(sample_tokens)) - ) - sample_completitions.extend([False] * (max_buffer_token_len - len(sample_completitions))) + # sample_tokens.extend( + # [self.chat_tokenizer.tokenizer.pad_token_id] * (max_buffer_token_len - len(sample_tokens)) + # ) + # sample_completitions.extend([False] * (max_buffer_token_len - len(sample_completitions))) # TODO delete, just 4 switching the training only on completitions setting - labels = self.create_labels(sample_tokens, sample_completitions) + self.create_labels(sample_tokens, sample_completitions) # TODO delete, just 4 switching the remove cross-attention setting position_ids = self.create_position_ids(sample_lengths, self.sequence_length) # TODO delete (debug) - assert len(sample_tokens) == max_buffer_token_len + # assert len(sample_tokens) == max_buffer_token_len yield { - "input_ids": np.array(sample_tokens[:-1], dtype=np.int32), - "label_ids": labels, + "input_ids": np.array(sample_tokens, dtype=np.int32), + "is_completitions": np.array(sample_completitions, dtype=np.bool_), "position_ids": position_ids, } diff --git a/src/nanotron/data/collator.py b/src/nanotron/data/collator.py index b34a7369..92138fe4 100644 --- a/src/nanotron/data/collator.py +++ b/src/nanotron/data/collator.py @@ -84,7 +84,7 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni # We could compute position ids after tokenizing each sample but we will still miss the last length of the padding tokens def build_position_ids(lengths, sequence_length) -> np.array: position_ids = [list(range(length)) for length in lengths] # Create position ids list - position_ids.append([0] * (sequence_length - sum(lengths))) # Append position_ids of the padding tokens + # position_ids.append([0] * (sequence_length - sum(lengths))) # Append position_ids of the padding tokens return np.array([x for xs in position_ids for x in xs], dtype=np.int32) # Flatten list of position ids @@ -132,33 +132,32 @@ def __call__(self, examples: List[Dict[str, List[int]]]) -> Dict[str, Union[torc assert all(len(example) == 0 for example in examples) return { "input_ids": TensorPointer(group_rank=self.input_pp_rank), - "input_mask": TensorPointer(group_rank=self.input_pp_rank), "label_ids": 
TensorPointer(group_rank=self.output_pp_rank), + "position_ids": TensorPointer(group_rank=self.input_pp_rank), "label_mask": TensorPointer(group_rank=self.output_pp_rank), } - # TODO clean this, as we are flatting the batch there is no necessity for vstack but we need the batch dimension too + # TODO(tj.solergibert) Clean this, as we are flattening the batch there is no necessity for vstack but we need the batch dimension too input_ids = np.vstack([examples[i]["input_ids"] for i in range(len(examples))]) # (b, s) - label_ids = np.vstack([examples[i]["label_ids"] for i in range(len(examples))]) # (b, s) + is_completitions = np.vstack([examples[i]["is_completitions"] for i in range(len(examples))]) # (b, s) position_ids = np.vstack([examples[i]["position_ids"] for i in range(len(examples))]) # (b, s) result: Dict[str, Union[np.ndarray, TensorPointer]] = {} result["input_ids"] = TensorPointer(group_rank=self.input_pp_rank) - result["input_mask"] = TensorPointer(group_rank=self.input_pp_rank) + result["position_ids"] = TensorPointer(group_rank=self.input_pp_rank) result["label_ids"] = TensorPointer(group_rank=self.output_pp_rank) result["label_mask"] = TensorPointer(group_rank=self.output_pp_rank) # Process inputs if current_pp_rank == self.input_pp_rank: - result["input_ids"] = input_ids - result["input_mask"] = np.ones((1, self.sequence_length), dtype=np.bool_) - result["position_ids"] = position_ids + result["input_ids"] = input_ids[:, :-1] + result["position_ids"] = position_ids[:, :-1] - # Process labels: shift them to the left + # Process labels: shift them to the left. if current_pp_rank == self.output_pp_rank: - result["label_ids"] = label_ids - result["label_mask"] = np.ones((1, self.sequence_length), dtype=np.bool_) + result["label_ids"] = input_ids[:, 1:] + result["label_mask"] = is_completitions[:, 1:] # Cast np.array to torch.Tensor result = {k: v if isinstance(v, TensorPointer) else torch.from_numpy(v) for k, v in result.items()} diff --git a/src/nanotron/models/llama_sft.py b/src/nanotron/models/llama_sft.py index a7ccb9d2..9774ca7e 100644 --- a/src/nanotron/models/llama_sft.py +++ b/src/nanotron/models/llama_sft.py @@ -42,7 +42,6 @@ ) from nanotron.random import RandomStates from nanotron.scaling.parametrization import SpectralMupParametrizator, StandardParametrizator -from nanotron.utils import checkpoint_method logger = logging.get_logger(__name__) @@ -61,16 +60,14 @@ def _compute_default_rope_parameters( inv_freq (torch.Tensor) Contains the inverse frequencies for the RoPE embeddings """ + with torch.autocast(device_type="cuda", enabled=False): + base = config.rope_theta # NOTE(tj.solergibert) 500000.0 + dim = int(config.hidden_size // config.num_attention_heads) # NOTE(tj.solergibert) 128 - base = config.rope_theta # NOTE(tj.solergibert) 500000.0 - partial_rotary_factor = ( - config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 - ) # NOTE(tj.solergibert) 1 - dim = int((config.hidden_size // config.num_attention_heads) * partial_rotary_factor) # NOTE(tj.solergibert) 128 - - # Compute the inverse frequencies - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)) - return inv_freq + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)).cuda() + print(inv_freq.dtype) + return inv_freq # NOTE(tj.solergibert) Copied from 
https://github.com/huggingface/transformers/blob/5f841c74b62754f186a8c06a684d491524b7bc03/src/transformers/models/llama/modeling_llama.py#L81 @@ -85,9 +82,11 @@ def __init__( super().__init__() self.config = config - inv_freq = _compute_default_rope_parameters(self.config) # NOTE(tj.solergibert) shape: 64 , 1.0 - self.register_buffer("inv_freq", inv_freq, persistent=False) - self.original_inv_freq = self.inv_freq + self.inv_freq = _compute_default_rope_parameters(self.config) # NOTE(tj.solergibert) shape: 64 , 1.0 + # print(inv_freq.dtype) + # self.register_buffer("inv_freq", inv_freq, persistent=False) + # print(self.inv_freq.dtype) # TODO(tj.solergibert) register_buffer casts to bf16!!!! + # self.original_inv_freq = inv_freq @torch.no_grad() def forward(self, x, position_ids): @@ -212,46 +211,6 @@ def forward(self, hidden_states): # [seq_length, batch_size, hidden_dim] return hidden_states -class CoreAttention(nn.Module): - def __init__(self, config: LlamaConfig, parallel_config: Optional[ParallelismArgs], layer_idx: int): - super().__init__() - # TODO @thomasw21: GPT has a weird `d_kv` config which I'm guessing is essentically a `d_qkv` - assert ( - config.hidden_size % config.num_attention_heads == 0 - ), f"Hidden size {config.hidden_size} must be divisible by number of attention heads {config.num_attention_heads}." - self.d_qk = config.hidden_size // config.num_attention_heads - self.d_v = config.hidden_size // config.num_attention_heads - self.is_using_mup = config.is_using_mup - - self.checkpoint_attention = False # Because flash_attn already does checkpointing - - @checkpoint_method(attr_name="checkpoint_attention") - def forward( - self, - query_states: torch.Tensor, # [batch_size, q_length, n_local_q_heads, inner_dim] - key_states: torch.Tensor, # [batch_size, kv_length, n_local_kv_heads, inner_dim] - value_states: torch.Tensor, # [batch_size, kv_length, n_local_kv_heads, inner_dim] - ): - from flash_attn.flash_attn_interface import flash_attn_func - - # NOTE: this scale is for µTransfer, - # in SP, we use sqrt(1/d_h) - softmax_scale = 1 / query_states.shape[-1] if self.is_using_mup else None - # For now we are assuming that we use causual mask. No magic here - causal = True - attn_output = flash_attn_func( - q=query_states, - k=key_states, - v=value_states, - dropout_p=0.0, - softmax_scale=softmax_scale, - causal=causal, - return_attn_probs=False, - ) - - return attn_output - - class CausalSelfAttention(nn.Module, AttachableStore): def __init__( self, @@ -289,7 +248,6 @@ def __init__( self.d_qk = config.hidden_size // config.num_attention_heads self.d_v = config.hidden_size // config.num_attention_heads self.d_model = config.hidden_size - self.is_using_mup = config.is_using_mup # TODO @thomasw21: refactor so that we store that default in a single place. tp_mode = parallel_config.tp_mode if parallel_config is not None else TensorParallelLinearMode.ALL_REDUCE @@ -323,13 +281,6 @@ def __init__( async_communication=tp_linear_async_communication, ) - # TODO(tj.solergibert) Deshacernos de este bloque POR DIOS!!! 
- self.attention = CoreAttention( - config, - parallel_config=parallel_config, - layer_idx=layer_idx, - ) - def forward( self, hidden_states, # [seq_length, batch_size, hidden_size] @@ -354,17 +305,27 @@ def forward( ) query_states = ( - query_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_q_heads, self.d_qk) + query_states.transpose(0, 1) + .contiguous() + .view( + batch_size, q_length, self.n_local_q_heads, self.d_qk + ) # TODO(tj.solergibert) q_length to -1 BUT q_lenght is already well computed ) key_states = ( - key_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) + key_states.transpose(0, 1) + .contiguous() + .view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) # TODO(tj.solergibert) q_length to -1 ) value_states = ( - value_states.transpose(0, 1).contiguous().view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) + value_states.transpose(0, 1) + .contiguous() + .view(batch_size, q_length, self.n_local_kv_heads, self.d_qk) # TODO(tj.solergibert) q_length to -1 ) else: query_states, key_states, value_states = ( - qkv_states.view(q_length, batch_size, 3, self.n_local_q_heads, self.d_qk) + qkv_states.view( + q_length, batch_size, 3, self.n_local_q_heads, self.d_qk + ) # TODO(tj.solergibert) q_length to -1 .permute(2, 1, 0, 3, 4) .contiguous() ) # [3, batch_size, seq_length, n_local_q_heads, d_qk] @@ -419,7 +380,9 @@ def forward( attention_output = ( attention_output.contiguous() - .view(batch_size, q_length, self.n_local_q_heads * self.d_v) + .view( + batch_size, q_length, self.n_local_q_heads * self.d_v + ) # TODO(tj.solergibert) q_length to -1. Also take care of batch size will be always 1 .transpose(0, 1) # TODO(tj.solergibert) View is necessary, but contiguous? ) output = self.o_proj(attention_output) @@ -503,17 +466,18 @@ def forward(self, input_ids: torch.Tensor, position_ids: torch.Tensor): # [batc # store["past_length"] = past_length + cumsum_mask[:, -1] ################################################################ + # Format input in `[seq_length, batch_size]` to support high TP with low batch_size + input_ids = input_ids.transpose(0, 1) + input_embeds = self.token_embedding(input_ids) + # NOTE(tj.solergibert) We create the cos & sin and propagate them through the pipeline so we # don't have to create the LlamaRotaryEmbedding layer in each and every decoder layer # We will still send the position ids for the varlen, but we will try to delete it. Computing them from # the position ids it's not very expensive AND we keep a tensor with constant shape cos, sin = self.position_embedding( - input_ids, position_ids + input_embeds, position_ids ) # TODO(tj.solergibert) We just need from inputs_ids the device type - # Format input in `[seq_length, batch_size]` to support high TP with low batch_size - input_ids = input_ids.transpose(0, 1) - input_embeds = self.token_embedding(input_ids) return {"input_embeds": input_embeds, "position_ids": position_ids, "cos": cos, "sin": sin} @@ -669,6 +633,8 @@ def get_flops_per_sec(self, iteration_time_in_sec, sequence_length, global_batch return model_flops_per_s, hardware_flops_per_s +# TODO(tj.solergibert) OJO con la label mask!!! Tal vez necesitamos hacer algo con los input ids!! +# TODO(tj.solergibert) A pero espera, si esta a -100 ya basta no? Habria que comprobar eso con la loss esa rara que hacemos, mierda!! 
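The two TODO notes above ask whether masking the labels is enough, or whether the input ids also need special handling. The loss check added later (both in the notebook and in tools/check_sft.py) settles this by comparing nanotron's label_mask-based loss against an HF-style CrossEntropyLoss with -100 labels; the equivalence itself is easy to see in a standalone sketch (illustrative shapes, not the actual masked_mean defined right below):

# Sketch: a boolean label_mask over per-token cross-entropy equals
# CrossEntropyLoss with ignore_index=-100 on the masked-out positions.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
vocab, seq = 32, 11
logits = torch.randn(seq, vocab)
labels = torch.randint(0, vocab, (seq,))
label_mask = torch.tensor([False] * 4 + [True] * 7)  # train on completions only

# Masked-mean variant (what a label_mask-based loss computes)
per_token = F.cross_entropy(logits, labels, reduction="none")
masked_mean_loss = (per_token * label_mask).sum() / label_mask.sum()

# HF-style variant: non-completion tokens set to -100 and ignored
hf_labels = torch.where(label_mask, labels, torch.tensor(-100))
hf_loss = F.cross_entropy(logits, hf_labels, ignore_index=-100)

torch.testing.assert_close(masked_mean_loss, hf_loss)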
@torch.jit.script def masked_mean(loss, label_mask, dtype): # type: (Tensor, Tensor, torch.dtype) -> Tensor From c57533d27a4e71565ce20a6d27834a2edac1935b Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 29 Jul 2024 09:42:29 +0000 Subject: [PATCH 3/9] Added SFT generations check script --- tools/check_sft.py | 236 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 236 insertions(+) create mode 100644 tools/check_sft.py diff --git a/tools/check_sft.py b/tools/check_sft.py new file mode 100644 index 00000000..3a2f9816 --- /dev/null +++ b/tools/check_sft.py @@ -0,0 +1,236 @@ +import torch +from nanotron.config import ParallelismArgs +from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron +from nanotron.data.chat_dataset import ChatDataset +from nanotron.data.dataloader_builder import build_chat_dataloader +from nanotron.models import build_model +from nanotron.models.llama_sft import LlamaForSFT +from nanotron.parallel import ParallelContext +from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine +from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode +from nanotron.trainer import mark_tied_parameters +from torch.testing import assert_close +from transformers import AutoModelForCausalLM, LlamaConfig + +dtype = torch.bfloat16 +device = torch.device("cuda") +PATH_TO_LLAMA = "/mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct" + +# NOTE(tj.solergibert) This script is for testing porpuses. ONLY use 1 GPU +DP = 1 +PP = 1 +TP = 1 + +# NOTE(tj.solergibert) How many K-first tokens must match +TOPK_MATCH = 3 + + +def main(): + hf_model = AutoModelForCausalLM.from_pretrained( + PATH_TO_LLAMA, torch_dtype=dtype, attn_implementation="flash_attention_2" + ).to(device) + hf_config = LlamaConfig.from_pretrained(PATH_TO_LLAMA) + + parallel_config = ParallelismArgs( + dp=DP, + pp=PP, + tp=TP, + pp_engine=AllForwardAllBackwardPipelineEngine(), + tp_mode=TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + assert ( + parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE + and parallel_config.tp_linear_async_communication is False + ) + + parallel_context = ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + + nanotron_config = LlamaConfigNanotron( + bos_token_id=hf_config.bos_token_id, + eos_token_id=hf_config.eos_token_id, + hidden_act=hf_config.hidden_act, + hidden_size=hf_config.hidden_size, + initializer_range=hf_config.initializer_range, + intermediate_size=hf_config.intermediate_size, + is_llama_config=True, + max_position_embeddings=hf_config.max_position_embeddings, + num_attention_heads=hf_config.num_attention_heads, + num_hidden_layers=hf_config.num_hidden_layers, + num_key_value_heads=hf_config.num_key_value_heads, + pad_token_id=None, + pretraining_tp=hf_config.pretraining_tp, + rms_norm_eps=hf_config.rms_norm_eps, + rope_scaling=hf_config.rope_scaling, + rope_theta=hf_config.rope_theta, + rope_interleaved=False, + tie_word_embeddings=hf_config.tie_word_embeddings, + use_cache=hf_config.use_cache, + vocab_size=hf_config.vocab_size, + ) + + nanotron_model = build_model( + model_builder=lambda: LlamaForSFT( + config=nanotron_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + ), + parallel_context=parallel_context, + dtype=dtype, + device=device, + ) + + mark_tied_parameters(model=nanotron_model, 
parallel_context=parallel_context) + + # Copy Llama3-8B-Instruct parameters + # Token embeddings + assert ( + nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape + == hf_model.model.embed_tokens.weight.shape + ) + + with torch.no_grad(): + nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.copy_( + hf_model.model.embed_tokens.weight + ) # = hf_model.model.embed_tokens.weight.data + + # Decoder layers + for i in range(nanotron_config.num_hidden_layers): + # Input layer norm + assert ( + hf_model.model.layers[i].input_layernorm.weight.shape + == nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.copy_( + hf_model.model.layers[i].input_layernorm.weight + ) # = hf_model.model.layers[i].input_layernorm.weight + # Self attn + ## QKV + tmp_qkv_proj = torch.cat( + [ + hf_model.model.layers[i].self_attn.q_proj.weight, + hf_model.model.layers[i].self_attn.k_proj.weight, + hf_model.model.layers[i].self_attn.v_proj.weight, + ], + dim=0, + ) + assert tmp_qkv_proj.shape == nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.shape + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.copy_( + tmp_qkv_proj + ) # = tmp_qkv_proj # torch.nn.Parameter(tmp_qkv_proj) + + ## O + assert ( + hf_model.model.layers[i].self_attn.o_proj.weight.shape + == nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.copy_( + hf_model.model.layers[i].self_attn.o_proj.weight + ) # = hf_model.model.layers[i].self_attn.o_proj.weight + # MLP + ## Gate Up Proj + tmp_gate_up_proj = torch.cat( + [ + hf_model.model.layers[i].mlp.gate_proj.weight, + hf_model.model.layers[i].mlp.up_proj.weight, + ], + dim=0, + ) + + assert tmp_gate_up_proj.shape == nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.shape + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.copy_( + tmp_gate_up_proj + ) # = tmp_gate_up_proj + ## Down Proj + assert ( + hf_model.model.layers[i].mlp.down_proj.weight.shape + == nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.copy_( + hf_model.model.layers[i].mlp.down_proj.weight + ) # = hf_model.model.layers[i].mlp.down_proj.weight + + # Post attn layer norm + assert ( + hf_model.model.layers[i].post_attention_layernorm.weight.shape + == nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.shape + ) + with torch.no_grad(): + nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.copy_( + hf_model.model.layers[i].post_attention_layernorm.weight + ) # = hf_model.model.layers[i].post_attention_layernorm.weight + + # Last layer norm + assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape + with torch.no_grad(): + nanotron_model.model.final_layer_norm.pp_block.weight.copy_( + hf_model.model.norm.weight + ) # = hf_model.model.norm.weight + # LM_Head + assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape + with torch.no_grad(): + nanotron_model.model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight) # = hf_model.lm_head.weight + + # Create ChatDataloaders + train_dataset = ChatDataset( + dataset_path="Open-Orca/SlimOrca", + 
tokenizer_name_or_path=PATH_TO_LLAMA, + sequence_length=2048, + train_on_completions_only=True, + remove_cross_attention=True, + split="train", + conversation_column_name="conversations", + dp_rank=parallel_context.dp_pg.rank(), + dp_ranks_size=parallel_context.dp_pg.size(), + ) + + # Prepare dataloader + train_dataloader = build_chat_dataloader( + dataset=train_dataset, + sequence_length=2048, + parallel_context=parallel_context, + input_pp_rank=0, + output_pp_rank=0, + ) + + batch = next(iter(train_dataloader)) + # Some DL Checks + assert batch["input_ids"].shape == batch["label_ids"].shape + assert batch["input_ids"].shape == batch["position_ids"].shape + assert batch["input_ids"].shape == batch["label_mask"].shape + + hf_model.eval() + nanotron_model.eval() + + with torch.inference_mode(): + output_nanotron = nanotron_model.model( + input_ids=batch["input_ids"].cuda(), position_ids=batch["position_ids"].cuda() + ) + output_hf = hf_model(input_ids=batch["input_ids"].cuda(), position_ids=batch["position_ids"].cuda()) + + predicted_tokens = [37, 89, 125, 423, 698, 912, 1298, 1723] + for predicted_token in predicted_tokens: + next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1) + hf_topk_next_tokens = torch.topk(next_tokens_hf, 10) + + next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0, 1)[0, predicted_token, :], -1) + nanotron_topk_next_tokens = torch.topk(next_tokens_nanotron, 10) + assert all(hf_topk_next_tokens[1][:TOPK_MATCH] == nanotron_topk_next_tokens[1][:TOPK_MATCH]) + + print("All generations match!") + # One last assertion of the logits + assert_close(output_hf.logits, output_nanotron.transpose(0, 1), rtol=1e-1, atol=1e-1) + + +if __name__ == "__main__": + main() From a66b0c62c06789bae9316787d16c6d3201957896 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Mon, 29 Jul 2024 14:27:54 +0000 Subject: [PATCH 4/9] Added masked LOSS check --- convert_hf_nanotron.ipynb | 240 ++++++++++++++++++++----------- src/nanotron/models/llama_sft.py | 4 +- tools/check_sft.py | 89 +++++++++--- 3 files changed, 225 insertions(+), 108 deletions(-) diff --git a/convert_hf_nanotron.ipynb b/convert_hf_nanotron.ipynb index 9bc573c3..34605e00 100644 --- a/convert_hf_nanotron.ipynb +++ b/convert_hf_nanotron.ipynb @@ -41,7 +41,7 @@ "/home/solergib/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. 
Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", - "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 13.15it/s]\n" + "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 7.36it/s]\n" ] } ], @@ -322,7 +322,15 @@ "cell_type": "code", "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading readme: 100%|██████████| 2.15k/2.15k [00:00<00:00, 13.8MB/s]\n" + ] + } + ], "source": [ "\"\"\"\n", "import importlib\n", @@ -382,6 +390,37 @@ "batch" ] }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'input_ids': tensor([[128000, 128006, 26380, ..., 16686, 13, 128009]],\n", + " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 576, 577, 578]], dtype=torch.int32), 'label_ids': tensor([[128006, 26380, 128007, ..., 13, 128009, 128001]],\n", + " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n", + "{'input_ids': tensor([[128000, 128006, 9125, ..., 27065, 13, 128009]],\n", + " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 517, 518, 519]], dtype=torch.int32), 'label_ids': tensor([[128006, 9125, 128007, ..., 13, 128009, 128001]],\n", + " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n", + "{'input_ids': tensor([[128000, 128006, 9125, ..., 62491, 13, 128009]],\n", + " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 641, 642, 643]], dtype=torch.int32), 'label_ids': tensor([[128006, 9125, 128007, ..., 13, 128009, 128001]],\n", + " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n", + "{'input_ids': tensor([[128000, 128006, 9125, ..., 15507, 13, 128009]],\n", + " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 86, 87, 88]], dtype=torch.int32), 'label_ids': tensor([[128006, 9125, 128007, ..., 13, 128009, 128001]],\n", + " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n" + ] + } + ], + "source": [ + "for i, batch in enumerate(train_dataloader):\n", + " print(batch)\n", + " if i == 3:\n", + " break" + ] + }, { "cell_type": "code", "execution_count": 14, @@ -715,7 +754,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -828,7 +867,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -948,154 +987,187 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 20, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "tensor([[ 4.9688, 6.1562, 10.8750, ..., -3.6406, -3.6406, -3.6406]],\n", - " device='cuda:0')" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" + "ename": "AssertionError", + "evalue": "Tensor-likes are not close!\n\nMismatched elements: 1013596 / 243301632 (0.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 0.1 allowed)\nGreatest relative difference: 537153.0 at index (0, 406, 16297) (up to 0.1 allowed)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[20], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m 
\u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43matol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", + "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 1013596 / 243301632 (0.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 0.1 allowed)\nGreatest relative difference: 537153.0 at index (0, 406, 16297) (up to 0.1 allowed)" + ] } ], "source": [ - "output_hf.logits[:,0,:]" + "assert_close(output_hf.logits, output_nanotron.transpose(0,1), atol=1e-1, rtol=1e-1)" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 21, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "tensor([[ 4.9375, 6.0938, 10.7500, ..., -3.6719, -3.6719, -3.6719]],\n", - " device='cuda:0')" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "[HF Model] Next token: 704, probability: 0.9999432563781738\n", + "[HF Model] Next token: 14, probability: 3.535549694788642e-05\n", + "[HF Model] Next token: 6917, probability: 1.67007528943941e-05\n", + "[HF Model] Next token: 1057, probability: 1.5534121757809771e-06\n", + "[HF Model] Next token: 320, probability: 1.209798483614577e-06\n", + "[HF Model] Next token: 315, probability: 9.421920026397856e-07\n", + "[HF Model] Next token: 412, probability: 1.637284157141039e-07\n", + "[HF Model] Next token: 9994, probability: 9.930631250654187e-08\n", + "[HF Model] Next token: 12, probability: 8.763750969364992e-08\n", + "[HF Model] Next token: 6033, probability: 6.825216303241177e-08\n" + ] } ], "source": [ - "output_nanotron.transpose(0,1)[:,0,:]" + "predicted_token = 345\n", + "\n", + "next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1)\n", + "hf_topk_next_tokens= torch.topk(next_tokens_hf, 10)\n", + "\n", + "\n", + "print(*[f\"[HF Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values)], sep=\"\\n\")" ] }, { "cell_type": 
"code", - "execution_count": 45, + "execution_count": 22, "metadata": {}, "outputs": [ { - "ename": "AssertionError", - "evalue": "Tensor-likes are not close!\n\nMismatched elements: 1143 / 128256 (0.9%)\nGreatest absolute difference: 0.5859375 at index (0, 12592) (up to 0.1 allowed)\nGreatest relative difference: 279.8438720703125 at index (0, 40526) (up to 0.1 allowed)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[45], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtesting\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m assert_close\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# TODO(tj.solergibert) Ojo este test es solo de la position 0 jajajjajajajajajajaj\u001b[39;00m\n\u001b[0;32m----> 5\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43matol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", - "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 1143 / 128256 (0.9%)\nGreatest absolute difference: 0.5859375 at index (0, 12592) (up to 0.1 allowed)\nGreatest relative difference: 279.8438720703125 at index (0, 40526) (up to 0.1 allowed)" + "name": "stdout", + "output_type": "stream", + "text": [ + "[Nanotron Model] Next token: 704, probability: 0.9999523162841797\n", + "[Nanotron Model] Next token: 14, probability: 3.120139808743261e-05\n", + "[Nanotron Model] Next token: 6917, probability: 1.3006677363591734e-05\n", + "[Nanotron Model] Next token: 1057, 
probability: 1.209809511237836e-06\n", + "[Nanotron Model] Next token: 320, probability: 9.422005859960336e-07\n", + "[Nanotron Model] Next token: 315, probability: 8.3148904650443e-07\n", + "[Nanotron Model] Next token: 412, probability: 1.2751297617796808e-07\n", + "[Nanotron Model] Next token: 9994, probability: 7.734053042440792e-08\n", + "[Nanotron Model] Next token: 12, probability: 6.825278120459188e-08\n", + "[Nanotron Model] Next token: 21337, probability: 6.023287113521292e-08\n" ] } ], "source": [ - "from torch.testing import assert_close\n", + "next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0,1)[0, predicted_token, :], -1)\n", + "nanotron_topk_next_tokens= torch.topk(next_tokens_nanotron, 10)\n", "\n", - "# TODO(tj.solergibert) Ojo este test es solo de la position 0 jajajjajajajajajajaj\n", "\n", - "assert_close(output_hf.logits[:,0,:], output_nanotron.transpose(0,1)[:,0,:], rtol=1e-1, atol=1e-1)" + "print(*[f\"[Nanotron Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(nanotron_topk_next_tokens.indices, nanotron_topk_next_tokens.values)], sep=\"\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comprobar loss con las masks!\n", + "HF no have lo de train on completitions only, o si? Creo que no tiene atten mask para los labels, asi que primero lo hacemos manual y luego a mano con su formula de crossentropy a mano con los -100!" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 29, "metadata": {}, "outputs": [ { - "ename": "AssertionError", - "evalue": "Tensor-likes are not close!\n\nMismatched elements: 217458927 / 243301632 (89.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 1e-05 allowed)\nGreatest relative difference: 1744897.0 at index (0, 1435, 64528) (up to 1.3e-06 allowed)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[46], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", - "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not 
close!\n\nMismatched elements: 217458927 / 243301632 (89.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 1e-05 allowed)\nGreatest relative difference: 1744897.0 at index (0, 1435, 64528) (up to 1.3e-06 allowed)" + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor(0.9076, device='cuda:0')\n" ] } ], "source": [ - "assert_close(output_hf.logits, output_nanotron.transpose(0,1))" + "# Nanotron\n", + "nanotron_loss = nanotron_model.loss(\n", + " sharded_logits=output_nanotron,\n", + " label_ids=batch[\"label_ids\"].cuda(),\n", + " label_mask=batch[\"label_mask\"].cuda(),\n", + " )[\"loss\"]\n", + "print(nanotron_loss)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def build_labels_completions_only(input_ids, is_completitions):\n", + " labels = np.where(\n", + " is_completitions, input_ids, -100\n", + " ) # Mask tokens that don't belong to the completitions by the Assistant\n", + " return torch.tensor(np.array(labels, dtype=np.int64))" + ] + }, + { + "cell_type": "code", + "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[HF Model] Next token: 11415, probability: 0.10412170737981796\n", - "[HF Model] Next token: 1523, probability: 0.04918361455202103\n", - "[HF Model] Next token: 47032, probability: 0.043404385447502136\n", - "[HF Model] Next token: 72514, probability: 0.03830423951148987\n", - "[HF Model] Next token: 3493, probability: 0.03830423951148987\n", - "[HF Model] Next token: 10477, probability: 0.03830423951148987\n", - "[HF Model] Next token: 16805, probability: 0.03175532445311546\n", - "[HF Model] Next token: 10552, probability: 0.026326090097427368\n", - "[HF Model] Next token: 7664, probability: 0.021825095638632774\n", - "[HF Model] Next token: 3041, probability: 0.018093638122081757\n" + "torch.Size([1897, 128256])\n", + "torch.Size([1897])\n", + "tensor(0.9081, device='cuda:0')\n" ] } ], "source": [ - "predicted_token = 34\n", + "# HF\n", + "from torch.nn import CrossEntropyLoss\n", "\n", - "next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1)\n", - "hf_topk_next_tokens= torch.topk(next_tokens_hf, 10)\n", + "hf_labels = build_labels_completions_only(batch[\"label_ids\"].flatten().tolist(), batch[\"label_mask\"].flatten().tolist())\n", "\n", + "shift_logits = output_hf.logits.contiguous()\n", + "shift_labels = hf_labels.contiguous()\n", + "loss_fct = CrossEntropyLoss()\n", "\n", - "print(*[f\"[HF Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values)], sep=\"\\n\")" + "shift_logits = shift_logits.view(-1, 128256)\n", + "shift_labels = shift_labels.view(-1)\n", + "# Enable model parallelism\n", + "shift_labels = shift_labels.to(\"cuda\")\n", + "hf_loss = loss_fct(shift_logits, shift_labels)\n", + "print(hf_loss)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 58, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Nanotron Model] Next token: 11415, probability: 0.10305546224117279\n", - "[Nanotron Model] Next token: 1523, probability: 0.048679955303668976\n", - "[Nanotron Model] Next token: 47032, probability: 0.04295990616083145\n", - "[Nanotron Model] Next token: 10477, probability: 0.04035709798336029\n", - "[Nanotron Model] Next token: 3493, probability: 0.04035709798336029\n", - 
"[Nanotron Model] Next token: 72514, probability: 0.03791198879480362\n", - "[Nanotron Model] Next token: 16805, probability: 0.031430136412382126\n", - "[Nanotron Model] Next token: 10552, probability: 0.027737000957131386\n", - "[Nanotron Model] Next token: 7664, probability: 0.02299478091299534\n", - "[Nanotron Model] Next token: 3041, probability: 0.017908351495862007\n" + "ename": "AssertionError", + "evalue": "Scalars are not close!\n\nExpected 0.9080765247344971 but got 0.9075685739517212.\nAbsolute difference: 0.0005079507827758789 (up to 0.0001 allowed)\nRelative difference: 0.0005593700188697129 (up to 0.0001 allowed)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[58], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnanotron_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhf_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43matol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-4\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", + "\u001b[0;31mAssertionError\u001b[0m: Scalars are not close!\n\nExpected 0.9080765247344971 but got 0.9075685739517212.\nAbsolute difference: 0.0005079507827758789 (up to 0.0001 allowed)\nRelative difference: 0.0005593700188697129 (up to 0.0001 allowed)" ] } ], "source": [ - "next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0,1)[0, predicted_token, :], -1)\n", - "nanotron_topk_next_tokens= torch.topk(next_tokens_nanotron, 10)\n", - "\n", - "\n", - "print(*[f\"[Nanotron Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(nanotron_topk_next_tokens.indices, nanotron_topk_next_tokens.values)], sep=\"\\n\")" + "assert_close(nanotron_loss, hf_loss, atol=1e-4, rtol=1e-4)" ] }, { diff --git a/src/nanotron/models/llama_sft.py b/src/nanotron/models/llama_sft.py index 9774ca7e..d8afb7e4 100644 --- a/src/nanotron/models/llama_sft.py +++ b/src/nanotron/models/llama_sft.py @@ -66,7 +66,6 @@ def _compute_default_rope_parameters( # Compute the inverse frequencies inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim)).cuda() - print(inv_freq.dtype) return inv_freq @@ -361,8 +360,7 @@ def forward( # Prepare varlen args cu_seqlens, max_seqlen_in_batch = prepare_varlen_args(position_ids) - 
print(cu_seqlens) - print(max_seqlen_in_batch) + query_states = query_states.view(-1, query_states.size(-2), query_states.size(-1)) key_states = key_states.view(-1, key_states.size(-2), key_states.size(-1)) value_states = value_states.view(-1, value_states.size(-2), value_states.size(-1)) diff --git a/tools/check_sft.py b/tools/check_sft.py index 3a2f9816..63c4daab 100644 --- a/tools/check_sft.py +++ b/tools/check_sft.py @@ -1,3 +1,7 @@ +""" +torchrun --nproc-per-node 1 tools/check_sft.py +""" +import numpy as np import torch from nanotron.config import ParallelismArgs from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron @@ -9,6 +13,7 @@ from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode from nanotron.trainer import mark_tied_parameters +from torch.nn import CrossEntropyLoss from torch.testing import assert_close from transformers import AutoModelForCausalLM, LlamaConfig @@ -24,6 +29,15 @@ # NOTE(tj.solergibert) How many K-first tokens must match TOPK_MATCH = 3 +BATCHES = 15 + + +def build_labels_completions_only(input_ids, is_completitions): + labels = np.where( + is_completitions, input_ids, -100 + ) # Mask tokens that don't belong to the completitions by the Assistant + return torch.tensor(np.array(labels, dtype=np.int64)) + def main(): hf_model = AutoModelForCausalLM.from_pretrained( @@ -203,33 +217,66 @@ def main(): output_pp_rank=0, ) - batch = next(iter(train_dataloader)) - # Some DL Checks - assert batch["input_ids"].shape == batch["label_ids"].shape - assert batch["input_ids"].shape == batch["position_ids"].shape - assert batch["input_ids"].shape == batch["label_mask"].shape - hf_model.eval() nanotron_model.eval() - with torch.inference_mode(): - output_nanotron = nanotron_model.model( - input_ids=batch["input_ids"].cuda(), position_ids=batch["position_ids"].cuda() - ) - output_hf = hf_model(input_ids=batch["input_ids"].cuda(), position_ids=batch["position_ids"].cuda()) + for i, batch in enumerate(train_dataloader): + if i == BATCHES: + break + print(f"Checking sample {i}!") + + # Some DL Checks + assert batch["input_ids"].shape == batch["label_ids"].shape + assert batch["input_ids"].shape == batch["position_ids"].shape + assert batch["input_ids"].shape == batch["label_mask"].shape + + with torch.inference_mode(): + output_nanotron = nanotron_model.model( + input_ids=batch["input_ids"].cuda(), position_ids=batch["position_ids"].cuda() + ) + output_hf = hf_model(input_ids=batch["input_ids"].cuda(), position_ids=batch["position_ids"].cuda()) - predicted_tokens = [37, 89, 125, 423, 698, 912, 1298, 1723] - for predicted_token in predicted_tokens: - next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1) - hf_topk_next_tokens = torch.topk(next_tokens_hf, 10) + # Assertion of the logits + # This will always fail! We aren't performing the SAME operations. Nanotron packs QKV matrices, MLP & LayerNorm is different. 
So we don't have to focus on MATCHING LOGITS BUT GENERATIONS + # assert_close(output_hf.logits, output_nanotron.transpose(0, 1), rtol=1e-1, atol=1e-1) + + predicted_tokens = [37, 92, 125, 423, 744, 912, 1298] + for predicted_token in predicted_tokens: + next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1) + hf_topk_next_tokens = torch.topk(next_tokens_hf, 10) + + next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0, 1)[0, predicted_token, :], -1) + nanotron_topk_next_tokens = torch.topk(next_tokens_nanotron, 10) + assert all( + hf_topk_next_tokens[1][:TOPK_MATCH] == nanotron_topk_next_tokens[1][:TOPK_MATCH] + ), f"HF: {hf_topk_next_tokens[1][:TOPK_MATCH]} \n\n{hf_topk_next_tokens[0][:TOPK_MATCH]}\n\n Nanotron: {nanotron_topk_next_tokens[1][:TOPK_MATCH]}\n\n{nanotron_topk_next_tokens[0][:TOPK_MATCH]}" + + print("All generations match!\nChecking Loss") + + # Loss check + nanotron_loss = nanotron_model.loss( + sharded_logits=output_nanotron, + label_ids=batch["label_ids"].cuda(), + label_mask=batch["label_mask"].cuda(), + )["loss"] + + # Creating labels_ids for HF loss computation + hf_labels = build_labels_completions_only( + batch["label_ids"].flatten().tolist(), batch["label_mask"].flatten().tolist() + ) + shift_logits = output_hf.logits.contiguous() + shift_labels = hf_labels.contiguous() + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, 128256) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to("cuda") + hf_loss = loss_fct(shift_logits, shift_labels) - next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0, 1)[0, predicted_token, :], -1) - nanotron_topk_next_tokens = torch.topk(next_tokens_nanotron, 10) - assert all(hf_topk_next_tokens[1][:TOPK_MATCH] == nanotron_topk_next_tokens[1][:TOPK_MATCH]) + assert_close(nanotron_loss, hf_loss, atol=1e-2, rtol=1e-2) # -3 is fine for most cases too + print("Loss match!") - print("All generations match!") - # One last assertion of the logits - assert_close(output_hf.logits, output_nanotron.transpose(0, 1), rtol=1e-1, atol=1e-1) + print("\n\n\nBoth generations and losses match!") if __name__ == "__main__": From 06af8cff0679d8aec29feb280794db6be19830b5 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 30 Jul 2024 09:31:26 +0000 Subject: [PATCH 5/9] Getting ready --- ...llama_sft.yaml => config_llama8b_sft.yaml} | 2 +- run_train.py | 1 - src/nanotron/data/chat_dataset.py | 41 ++++----- src/nanotron/data/chat_tokenizer.py | 4 +- src/nanotron/data/collator.py | 24 ++--- src/nanotron/data/dataloader_builder.py | 6 +- src/nanotron/models/llama_sft.py | 91 ++++++------------- src/nanotron/trainer.py | 7 ++ tools/check_sft.py | 16 ++-- 9 files changed, 77 insertions(+), 115 deletions(-) rename examples/{config_llama_sft.yaml => config_llama8b_sft.yaml} (97%) diff --git a/examples/config_llama_sft.yaml b/examples/config_llama8b_sft.yaml similarity index 97% rename from examples/config_llama_sft.yaml rename to examples/config_llama8b_sft.yaml index d65f7683..61dd8222 100644 --- a/examples/config_llama_sft.yaml +++ b/examples/config_llama8b_sft.yaml @@ -7,7 +7,7 @@ checkpoints: data_stages: - data: dataset: - hf_dataset: Open-Orca/SlimOrca + hf_dataset: Magpie-Align/Magpie-Pro-300K-Filtered hf_dataset_split: train conversation_column_name: conversations train_on_completions_only: true diff --git a/run_train.py b/run_train.py index 60f01373..ae89365c 100644 --- a/run_train.py +++ b/run_train.py @@ -191,7 +191,6 @@ def 
get_dataloader_from_data_stage( # Prepare dataloader train_dataloader = build_chat_dataloader( dataset=train_dataset, - sequence_length=trainer.sequence_length, parallel_context=trainer.parallel_context, input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, diff --git a/src/nanotron/data/chat_dataset.py b/src/nanotron/data/chat_dataset.py index 240b9e8e..79ec9be5 100644 --- a/src/nanotron/data/chat_dataset.py +++ b/src/nanotron/data/chat_dataset.py @@ -17,7 +17,7 @@ class ChatDataset(IterableDataset): """ Chat Dataset for training models with: - 1. Packing + 1. Padding-Free Packing 2. No cross-contamination between packed samples 3. Train on completitions only @@ -44,13 +44,14 @@ def __init__( split: str = "train", dp_rank: int = 0, dp_ranks_size: int = 1, - skip_num_samples: int = None, # TODO Delete, check later comment + skip_num_samples: int = None, # TODO(tj.solergibert) Delete, check later comment seed: int = 1234, ) -> None: - # TODO: Support checkpointing for resuming training. We have to store the number of consumed samples from the dataset (Which is different from the number of steps) and the buffers. + # WARN(tj.solergibert) Currently we DON'T support recovering training from a interruption. Check the following TODOs + # TODO(tj.solergibert) Support checkpointing for resuming training. We have to store the number of consumed samples from the dataset (Which is different from the number of steps) and the BUFFERS. # skip_num_samples will fail, as it's computed with the number of steps and as we are packing sequences we might have consumed MORE samples from the dataset - # TODO: Support interleaving datasets + # TODO(tj.solergibert) Support interleaving datasets self.dataset_path = dataset_path self.chat_tokenizer = ChatTokenizer(tokenizer_name_or_path) @@ -59,24 +60,24 @@ def __init__( self.skip_num_samples = skip_num_samples self.seed = seed - # Load, split and shuffle dataset. Also skip samples if resuming training. 
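For intuition, a minimal sketch (with made-up token ids) of what the train-on-completions-only setting means for the loss: positions outside the assistant completions get the label -100, which CrossEntropyLoss ignores, mirroring hf_build_labels_completions_only in tools/check_sft.py.

import numpy as np
import torch
from torch.nn import CrossEntropyLoss

# Toy packed sample: 6 tokens, only the last 3 belong to the assistant completion
input_ids = np.array([128000, 882, 9906, 78191, 15339, 128009])
is_completions = np.array([False, False, False, True, True, True])

# Mask tokens outside the completion with -100 so the loss ignores them
labels = torch.tensor(np.where(is_completions, input_ids, -100), dtype=torch.long)

logits = torch.randn(len(input_ids), 128256)   # Llama3 vocab size, as in the checks above
loss = CrossEntropyLoss()(logits, labels)      # only the 3 completion positions contribute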
+ # Load, split and shuffle dataset self.dataset = load_dataset(dataset_path, split=split, streaming=True) self.dataset = split_dataset_by_node(self.dataset, dp_rank, dp_ranks_size) self.dataset = self.dataset.shuffle(seed=seed, buffer_size=10_000) - # TODO delete, just 4 switching the training only on completitions setting + # TODO(tj.solergibert) Delete (debug), just 4 switching the training only on completitions setting if train_on_completions_only: self.create_labels = build_labels_completions_only else: self.create_labels = build_labels - # TODO delete, just 4 switching the remove cross-attention setting + # TODO Delete (debug), just 4 switching the remove cross-attention setting if remove_cross_attention: self.create_position_ids = build_position_ids else: self.create_position_ids = build_position_ids_dummy - # Todo delete (debug), just change the dict keys + # TODO(tj.solergibert) Delete (debug) self.debug_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) # TODO delete debug self.debug_tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['from'] + '<|end_header_id|>\n\n'+ message['value'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>' }}{% endif %}" @@ -90,9 +91,8 @@ def __iter__(self): for sample in iter(self.dataset): tokens, is_completition = self.chat_tokenizer(sample[self.conversation_column_name]) - # TODO assert that tokenized conversations are not longer than max_buffer_token_len? - - # TODO delete (debug). The [:-1] of tokens is because apply chat template doesn't adds eos (NOT eot) token + # TODO(tj.solergibert) Delete (debug). Check if HF apply_chat_template produces the same result as ChatTokenizer + # The [:-1] of tokens is because apply chat template doesn't adds eos (NOT eot) token assert ( self.debug_tokenizer.apply_chat_template(sample["conversations"]) == tokens[:-1] ), f'{self.debug_tokenizer.apply_chat_template(sample["conversations"])}\n\n{tokens[:-1]}' @@ -107,7 +107,7 @@ def __iter__(self): sample_completitions = buffer_is_completition[: -len(tokens)] sample_lengths = buffer_lengths[:-1] - # TODO delete (debug) + # TODO(tj.solergibert) Delete (debug) assert len(sample_tokens) == len(sample_completitions) == sum(sample_lengths) # Reset tokens buffers @@ -115,20 +115,14 @@ def __iter__(self): buffer_is_completition = is_completition.copy() buffer_lengths = [len(tokens)] - # Pad to max_buffer_token_len. 
Pad token added in ChatTokenizer init if necessary - # sample_tokens.extend( - # [self.chat_tokenizer.tokenizer.pad_token_id] * (max_buffer_token_len - len(sample_tokens)) - # ) - # sample_completitions.extend([False] * (max_buffer_token_len - len(sample_completitions))) - - # TODO delete, just 4 switching the training only on completitions setting - self.create_labels(sample_tokens, sample_completitions) + # TODO(tj.solergibert) Delete (debug), just 4 switching the training only on completitions setting + sample_completitions = self.create_labels(sample_tokens, sample_completitions) - # TODO delete, just 4 switching the remove cross-attention setting + # TODO(tj.solergibert) Delete (debug), just 4 switching the remove cross-attention setting position_ids = self.create_position_ids(sample_lengths, self.sequence_length) - # TODO delete (debug) - # assert len(sample_tokens) == max_buffer_token_len + # TODO(tj.solergibert) Delete (debug) + # assert len(sample_tokens) <= max_buffer_token_len yield { "input_ids": np.array(sample_tokens, dtype=np.int32), @@ -136,4 +130,5 @@ def __iter__(self): "position_ids": position_ids, } + # TODO(tj.solergibert) Change for log_rank (log_rank is problematic with JupyterNB) print("Consumed all samples, dataset is being re-looped.") diff --git a/src/nanotron/data/chat_tokenizer.py b/src/nanotron/data/chat_tokenizer.py index 847a365f..f8ff3b09 100644 --- a/src/nanotron/data/chat_tokenizer.py +++ b/src/nanotron/data/chat_tokenizer.py @@ -59,8 +59,8 @@ def __call__(self, conversation: List[dict]) -> Tuple[List[int], List[bool]]: return tokens, is_completitions def encode_message(self, message: dict) -> Tuple[List[int], List[int]]: - # TODO The "from", "value", "gpt" keys are form SlimOrca Dataset. Llama3 uses another ones. We should stick to a - # single format and document it properly rather than supporting multiple formats, as each one will need a different + # NOTE(tj.solergibert) The "from", "value", "gpt" keys are from SlimOrca Dataset. Llama3 HF Pretrained tokenizer uses another ones. We should stick to a + # single format and document it properly rather than supporting multiple formats, as each DATASET will need a different # ChatTokenizer and the idea is that all Datasets share the same ChatTokenizer # Encode header diff --git a/src/nanotron/data/collator.py b/src/nanotron/data/collator.py index 92138fe4..ea68b8b8 100644 --- a/src/nanotron/data/collator.py +++ b/src/nanotron/data/collator.py @@ -80,43 +80,36 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni return result -# TODO Find a more elegant way. e.g. extend instead of append. 
OK, so no extend -# We could compute position ids after tokenizing each sample but we will still miss the last length of the padding tokens +# TODO(tj.solergibert) After "Beta", delete all the functs except `build_position_ids` and move `build_position_ids` to chat_dataset.py def build_position_ids(lengths, sequence_length) -> np.array: position_ids = [list(range(length)) for length in lengths] # Create position ids list - # position_ids.append([0] * (sequence_length - sum(lengths))) # Append position_ids of the padding tokens return np.array([x for xs in position_ids for x in xs], dtype=np.int32) # Flatten list of position ids -# TODO delete, just 4 switching the remove cross-attention setting +# TODO(tj.solergibert) Delete (debug), just 4 switching the remove cross-attention setting def build_position_ids_dummy(lengths, sequence_length) -> np.array: - return np.array(list(range(sequence_length)), dtype=np.int32) # TODO numpy arange + return np.array(list(range(sum(lengths))), dtype=np.int32) # TODO numpy arange -# TODO delete, just 4 switching the training only on completitions setting. This will be in the __iter__ method instead of a function +# TODO(tj.solergibert) Delete (debug), just 4 switching the training only on completitions setting. def build_labels_completions_only(input_ids, is_completitions): - labels = np.where( - is_completitions, input_ids, -100 - ) # Mask tokens that don't belong to the completitions by the Assistant - return np.array(labels[1:], dtype=np.int32) + return is_completitions -# TODO delete, just 4 switching the training only on completitions setting +# TODO(tj.solergibert) Delete (debug), just 4 switching the training only on completitions setting def build_labels(input_ids, is_completitions): - return np.array(input_ids[1:], dtype=np.int32) + return [True for _ in range(len(is_completitions))] @dataclass -class NanoChatDataCollatorForSFT: # TODO(tj.solergibert) Find a better name +class DataCollatorForSFT: """ Data collator used with Chat Dataset. - - sequence_length: Sequence length of each sample in the batch - input_pp_rank: Discards last input id token - output_pp_rank: Discards first label id token - other pp ranks: Don't have data. Instead, we use `TensorPointer` to point to the rank having the data. 
""" - sequence_length: int input_pp_rank: int output_pp_rank: int parallel_context: ParallelContext @@ -137,7 +130,6 @@ def __call__(self, examples: List[Dict[str, List[int]]]) -> Dict[str, Union[torc "label_mask": TensorPointer(group_rank=self.output_pp_rank), } - # TODO(tj.solergibert) Clean this, as we are flattening the batch there is no necessity for vstack but we need the batch dimension too input_ids = np.vstack([examples[i]["input_ids"] for i in range(len(examples))]) # (b, s) is_completitions = np.vstack([examples[i]["is_completitions"] for i in range(len(examples))]) # (b, s) position_ids = np.vstack([examples[i]["position_ids"] for i in range(len(examples))]) # (b, s) diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py index f63237ad..2136cfcc 100644 --- a/src/nanotron/data/dataloader_builder.py +++ b/src/nanotron/data/dataloader_builder.py @@ -1,6 +1,6 @@ import nanotron.distributed as dist from nanotron import logging -from nanotron.data.collator import NanoChatDataCollatorForSFT, NanosetDataCollatorForCLM +from nanotron.data.collator import DataCollatorForSFT, NanosetDataCollatorForCLM from nanotron.dataloader import ( EmptyInfiniteDataset, get_dataloader_worker_init, @@ -66,7 +66,6 @@ def build_nanoset_dataloader( def build_chat_dataloader( dataset, - sequence_length: int, parallel_context: ParallelContext, input_pp_rank: int, output_pp_rank: int, @@ -78,8 +77,7 @@ def build_chat_dataloader( dataset_length = 1_000_000 # len(dataset) TODO find a more elegant way to specify this dummy dataset dataset = EmptyInfiniteDataset(length=dataset_length) - data_collator = NanoChatDataCollatorForSFT( - sequence_length=sequence_length, + data_collator = DataCollatorForSFT( input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, parallel_context=parallel_context, diff --git a/src/nanotron/models/llama_sft.py b/src/nanotron/models/llama_sft.py index d8afb7e4..35df7cab 100644 --- a/src/nanotron/models/llama_sft.py +++ b/src/nanotron/models/llama_sft.py @@ -45,8 +45,17 @@ logger = logging.get_logger(__name__) +#################################################################################### +############################## SFT Auxiliary functions ############################## +#################################################################################### +## Copied RoPE functions from HF Transformers. Nanotron ships with FlashAttention ## +## RoPEs written in triton which are considerbly faster BUT currently they don't ## +## support the poisiton ids necessary for the cross attention feature. The cos & ## +## sin are created in the embedding layer and propagated through the pipeline so ## +## we don't have a RoPE layer in each and every decoder layer. Then in each and ## +## every decoder layer we apply the cos & sin to Q & K with `apply_rotary_pos_emb`## +#################################################################################### -####### # NOTE(tj.solergibert) Copied from https://github.com/huggingface/transformers/blob/81233c069c166af033794134bd8888783ac49ebe/src/transformers/modeling_rope_utils.py#L29 def _compute_default_rope_parameters( config: LlamaConfig, @@ -81,10 +90,8 @@ def __init__( super().__init__() self.config = config - self.inv_freq = _compute_default_rope_parameters(self.config) # NOTE(tj.solergibert) shape: 64 , 1.0 - # print(inv_freq.dtype) - # self.register_buffer("inv_freq", inv_freq, persistent=False) - # print(self.inv_freq.dtype) # TODO(tj.solergibert) register_buffer casts to bf16!!!! 
+ self.inv_freq = _compute_default_rope_parameters(self.config) + # self.register_buffer("inv_freq", inv_freq, persistent=False) # NOTE(tj.solergibert) register_buffer casts to bf16! # self.original_inv_freq = inv_freq @torch.no_grad() @@ -130,10 +137,10 @@ def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1): Returns: tuple (torch.Tensor) comprising of the query and key tensors rotated using the Rotary Position Embedding. """ - cos = cos.unsqueeze(unsqueeze_dim) # NOTE(tj.solergibert) [1, 70, 128] --> [1, 1, 70, 128] - sin = sin.unsqueeze(unsqueeze_dim) # NOTE(tj.solergibert) - q_embed = (q * cos) + (rotate_half(q) * sin) # NOTE(tj.solergibert) [1, 32, 70, 128] - k_embed = (k * cos) + (rotate_half(k) * sin) # NOTE(tj.solergibert) [1, 8, 70, 128] + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed @@ -152,7 +159,7 @@ def prepare_varlen_args(position_ids): return cu_seqlens, max_seqlen_in_batch -####### +#################################################################################### class GLUActivation(nn.Module): @@ -329,29 +336,6 @@ def forward( .contiguous() ) # [3, batch_size, seq_length, n_local_q_heads, d_qk] - # Training case OLD - # Apply rotary embeddings to query/key states - # NOTE: The layout is different from models/llama.py which is [batch_size, num_heads, seq_length, d_qk] - # Here it is, [batch_size, seq_length, num_heads, d_qk] - # [2, batch_size, seq_length, num_heads, d_qk] - # key_value_states = torch.cat([key_states.unsqueeze(0), value_states.unsqueeze(0)], dim=0) - # [batch_size, seq_length, 2, num_heads, d_qk] - # key_value_states = key_value_states.permute(1, 2, 0, 3, 4).contiguous() - # query_states, key_value_states = self.flash_rotary_embedding(query_states, kv=key_value_states) - # [batch_size, seq_length, num_heads, d_qk] - # key_states, value_states = torch.split(key_value_states, 1, dim=2) - - # TODO(tj.solergibert) ver si esto sirve de algo o no!!!!! - # kv_length = key_states.shape[1] - # key_states = key_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_qk) - # value_states = value_states.view(batch_size, kv_length, self.n_local_kv_heads, self.d_v) - - # attention_output = self.attention( - # query_states=query_states, - # key_states=key_states, - # value_states=value_states, - # ) - # TODO(tj.solergibert) Apply RoPE embeddings WITHOUT too many transpose... query_states, key_states = query_states.transpose(1, 2), key_states.transpose(1, 2) # Apply RoPE @@ -366,21 +350,19 @@ def forward( value_states = value_states.view(-1, value_states.size(-2), value_states.size(-1)) attention_output = flash_attn_varlen_func( - query_states, # NOTE(tj.solergibert) Shape: [70, 32, 128] - key_states, # NOTE(tj.solergibert) Shape: [70, 8, 128] - value_states, # NOTE(tj.solergibert) Shape: [70, 8, 128] - cu_seqlens_q=cu_seqlens, # NOTE(tj.solergibert) Shape: Tensor, [14] - cu_seqlens_k=cu_seqlens, # NOTE(tj.solergibert) Shape: Tensor, [14] - max_seqlen_q=max_seqlen_in_batch, # NOTE(tj.solergibert) Shape: Tensor, [1] Just 1 element with the longer sequence in batch. In the HF Transformers dummy test is 7 - max_seqlen_k=max_seqlen_in_batch, # NOTE(tj.solergibert) Shape: Tensor, [1] Just 1 element with the longer sequence in batch. 
In the HF Transformers dummy test is 7 - causal=True, # NOTE(tj.solergibert) True + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen_in_batch, + max_seqlen_k=max_seqlen_in_batch, + causal=True, ) # NOTE(tj.solergibert) Returns out: (total, nheads, headdim). attention_output = ( attention_output.contiguous() - .view( - batch_size, q_length, self.n_local_q_heads * self.d_v - ) # TODO(tj.solergibert) q_length to -1. Also take care of batch size will be always 1 + .view(batch_size, q_length, self.n_local_q_heads * self.d_v) .transpose(0, 1) # TODO(tj.solergibert) View is necessary, but contiguous? ) output = self.o_proj(attention_output) @@ -451,27 +433,13 @@ def __init__(self, tp_pg: dist.ProcessGroup, config: LlamaConfig, parallel_confi self.position_embedding = LlamaRotaryEmbedding(config=config) def forward(self, input_ids: torch.Tensor, position_ids: torch.Tensor): # [batch_size, seq_length] - # TODO(tj.solergibert) Delete this store stuff ################ - store = self.get_local_store() - if store is not None: - if "past_length" in store: - store["past_length"] - else: - torch.zeros(1, dtype=torch.long, device=input_ids.device).expand(input_ids.shape[0]) - - # cumsum_mask = input_mask.cumsum(-1, dtype=torch.long) - # Store new past_length in store - # store["past_length"] = past_length + cumsum_mask[:, -1] - ################################################################ - # Format input in `[seq_length, batch_size]` to support high TP with low batch_size input_ids = input_ids.transpose(0, 1) input_embeds = self.token_embedding(input_ids) # NOTE(tj.solergibert) We create the cos & sin and propagate them through the pipeline so we # don't have to create the LlamaRotaryEmbedding layer in each and every decoder layer - # We will still send the position ids for the varlen, but we will try to delete it. Computing them from - # the position ids it's not very expensive AND we keep a tensor with constant shape + # We will still send the position ids for the varlen cos, sin = self.position_embedding( input_embeds, position_ids ) # TODO(tj.solergibert) We just need from inputs_ids the device type @@ -631,8 +599,6 @@ def get_flops_per_sec(self, iteration_time_in_sec, sequence_length, global_batch return model_flops_per_s, hardware_flops_per_s -# TODO(tj.solergibert) OJO con la label mask!!! Tal vez necesitamos hacer algo con los input ids!! -# TODO(tj.solergibert) A pero espera, si esta a -100 ya basta no? Habria que comprobar eso con la loss esa rara que hacemos, mierda!! 
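Concretely, the varlen arguments passed to flash_attn_varlen_func above are recovered from the packed position ids alone; a small sketch with toy lengths (the exact prepare_varlen_args helper may differ in details):

import torch

# Three conversations of lengths 3, 2 and 4 packed into a single flattened sequence
position_ids = torch.tensor([0, 1, 2, 0, 1, 0, 1, 2, 3], dtype=torch.int32)

# Every reset to 0 marks the start of a new sample; cu_seqlens are the cumulative boundaries
starts = torch.where(position_ids == 0)[0]
cu_seqlens = torch.cat([starts, torch.tensor([position_ids.numel()])]).to(torch.int32)  # [0, 3, 5, 9]
max_seqlen_in_batch = (cu_seqlens[1:] - cu_seqlens[:-1]).max()                          # 4

# flash_attn_varlen_func(q, k, v, cu_seqlens_q=cu_seqlens, cu_seqlens_k=cu_seqlens,
#                        max_seqlen_q=max_seqlen_in_batch, max_seqlen_k=max_seqlen_in_batch, causal=True)
# never lets a query attend across one of these boundaries, so packed samples don't contaminate each other.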
@torch.jit.script def masked_mean(loss, label_mask, dtype): # type: (Tensor, Tensor, torch.dtype) -> Tensor @@ -695,15 +661,18 @@ def forward( label_ids: Union[torch.Tensor, TensorPointer], label_mask: Union[torch.Tensor, TensorPointer], ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + sharded_logits = self.model( input_ids=input_ids, position_ids=position_ids, ) + loss = self.loss( sharded_logits=sharded_logits, label_ids=label_ids, label_mask=label_mask, )["loss"] + return {"loss": loss} @torch.no_grad() diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 9984b881..14520e33 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -26,6 +26,7 @@ from nanotron import distributed as dist from nanotron import logging from nanotron.config import ( + ChatDatasetsArgs, Config, DatasetStageArgs, ExistingCheckpointInit, @@ -57,6 +58,7 @@ from nanotron.models import NanotronModel, build_model from nanotron.models.base import check_model_has_grad from nanotron.models.llama import LlamaForTraining +from nanotron.models.llama_sft import LlamaForSFT from nanotron.models.starcoder2 import Starcoder2ForTraining from nanotron.optim.clip_grads import clip_grad_norm from nanotron.parallel import ParallelContext @@ -102,6 +104,7 @@ CONFIG_TO_MODEL_CLASS = { "LlamaConfig": LlamaForTraining, + "LlamaConfigForSFT": LlamaForSFT, "Starcoder2Config": Starcoder2ForTraining, } @@ -670,6 +673,10 @@ def init_model(self) -> Union[NanotronModel, DistributedDataParallel]: def _init_model_instance(self) -> NanotronModel: model_config_cls = self.model_config.__class__.__name__ + + if model_config_cls == "LlamaConfig" and isinstance(self.config.data_stages[0].data.dataset, ChatDatasetsArgs): + model_config_cls = "LlamaConfigForSFT" + assert ( model_config_cls in CONFIG_TO_MODEL_CLASS ), f"Unsupported model config {model_config_cls}. Only {CONFIG_TO_MODEL_CLASS.keys()} are supported" diff --git a/tools/check_sft.py b/tools/check_sft.py index 63c4daab..6e80b883 100644 --- a/tools/check_sft.py +++ b/tools/check_sft.py @@ -27,12 +27,14 @@ TP = 1 # NOTE(tj.solergibert) How many K-first tokens must match -TOPK_MATCH = 3 +# NOTE(tj.solergibert) After running lot's of tests, MOST (If not 100%) of the times the most probable token matches. Sometimes there are slightly differences in the next tokens, +# usually when the first token has a very high probability and the rest are left with < 1e-2. +TOPK_MATCH = 1 BATCHES = 15 -def build_labels_completions_only(input_ids, is_completitions): +def hf_build_labels_completions_only(input_ids, is_completitions): labels = np.where( is_completitions, input_ids, -100 ) # Mask tokens that don't belong to the completitions by the Assistant @@ -197,10 +199,10 @@ def main(): # Create ChatDataloaders train_dataset = ChatDataset( - dataset_path="Open-Orca/SlimOrca", + dataset_path="Magpie-Align/Magpie-Pro-300K-Filtered", # "Open-Orca/SlimOrca", tokenizer_name_or_path=PATH_TO_LLAMA, sequence_length=2048, - train_on_completions_only=True, + train_on_completions_only=False, remove_cross_attention=True, split="train", conversation_column_name="conversations", @@ -211,7 +213,6 @@ def main(): # Prepare dataloader train_dataloader = build_chat_dataloader( dataset=train_dataset, - sequence_length=2048, parallel_context=parallel_context, input_pp_rank=0, output_pp_rank=0, @@ -240,8 +241,9 @@ def main(): # This will always fail! We aren't performing the SAME operations. Nanotron packs QKV matrices, MLP & LayerNorm is different. 
So we don't have to focus on MATCHING LOGITS BUT GENERATIONS # assert_close(output_hf.logits, output_nanotron.transpose(0, 1), rtol=1e-1, atol=1e-1) - predicted_tokens = [37, 92, 125, 423, 744, 912, 1298] + predicted_tokens = [62, 92, 125, 425, 744, 912, 1298] for predicted_token in predicted_tokens: + print(predicted_token) next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1) hf_topk_next_tokens = torch.topk(next_tokens_hf, 10) @@ -261,7 +263,7 @@ def main(): )["loss"] # Creating labels_ids for HF loss computation - hf_labels = build_labels_completions_only( + hf_labels = hf_build_labels_completions_only( batch["label_ids"].flatten().tolist(), batch["label_mask"].flatten().tolist() ) shift_logits = output_hf.logits.contiguous() From a8f979d03c11cb76512ba764238f48b1a6a0a5eb Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Tue, 30 Jul 2024 16:55:18 +0000 Subject: [PATCH 6/9] RCP Working --- examples/config_llama8b_sft.yaml | 23 ++++++++++++----------- src/nanotron/trainer.py | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/examples/config_llama8b_sft.yaml b/examples/config_llama8b_sft.yaml index 61dd8222..010fc5e2 100644 --- a/examples/config_llama8b_sft.yaml +++ b/examples/config_llama8b_sft.yaml @@ -1,6 +1,6 @@ checkpoints: checkpoint_interval: 1000 - checkpoints_path: /mloscratch/homes/solergib/converter/nanotron/checkpoints + checkpoints_path: checkpoints/ checkpoints_path_is_shared_file_system: false resume_checkpoint_path: null save_initial_state: false @@ -20,7 +20,7 @@ general: benchmark_csv_path: null consumed_train_samples: null ignore_sanity_checks: true - project: Chat + project: SFT run: Llama3-8B seed: 42 step: null @@ -33,25 +33,26 @@ model: ddp_bucket_cap_mb: 25 dtype: bfloat16 init_method: - std: 0.025 + path: /mloscratch/homes/solergib/converter/nanotron/nanotron_checkpoints/NanotronLlama38B make_vocab_size_divisible_by: 1 model_config: - bos_token_id: 128000 - eos_token_id: 128001 + bos_token_id: 1 + eos_token_id: 2 hidden_act: silu hidden_size: 4096 initializer_range: 0.02 intermediate_size: 14336 is_llama_config: true max_position_embeddings: 4096 + num_hidden_layers: 32 num_attention_heads: 32 - num_hidden_layers: 4 num_key_value_heads: 8 pad_token_id: null pretraining_tp: 1 - rms_norm_eps: 1.0e-05 - rope_scaling: null + rope_interleaved: false rope_theta: 500000.0 + rms_norm_eps: 1.0e-06 + rope_scaling: null tie_word_embeddings: false use_cache: true vocab_size: 128256 @@ -59,7 +60,7 @@ optimizer: accumulate_grad_in_fp32: true clip_grad: 1.0 learning_rate_scheduler: - learning_rate: 0.0003 + learning_rate: 2.0e-5 lr_decay_starting_step: null lr_decay_steps: 98 lr_decay_style: cosine @@ -79,7 +80,7 @@ parallelism: expert_parallel_size: 1 pp: 1 pp_engine: 1f1b - tp: 1 + tp: 4 tp_linear_async_communication: false tp_mode: ALL_REDUCE profiler: null @@ -93,5 +94,5 @@ tokens: limit_val_batches: 0 micro_batch_size: 3 sequence_length: 4096 - train_steps: 100 + train_steps: 250 val_check_interval: -1 diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 14520e33..c4b1d1e5 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -255,6 +255,20 @@ def __init__( # NOTE: the dataloader currently in use for the current training stage self.current_dataloader: Optional[DataLoader] = None + # NOTE(tj.solergibert) Flatten batch size in SFT training + if isinstance(self.config.data_stages[0].data.dataset, ChatDatasetsArgs) and self.micro_batch_size != 1: + self.sequence_length = 
self.micro_batch_size * self.config.tokens.sequence_length + self.micro_batch_size = 1 + self.global_batch_size = ( + self.micro_batch_size * self.n_micro_batches_per_batch * self.parallel_context.dp_pg.size() + ) + log_rank( + f"Flattening Batch dimension for SFT training. global_batch_size:{self.global_batch_size}, micro_batch_size: {self.micro_batch_size}, sequence_length: {self.sequence_length}", + logger=logger, + level=logging.INFO, + rank=0, + ) + self.post_init() def pre_init(self): From c026422e5bf0bc1086c039e65d8f7bbe75dc9728 Mon Sep 17 00:00:00 2001 From: Antoni-Joan Solergibert Date: Tue, 30 Jul 2024 22:42:09 +0200 Subject: [PATCH 7/9] Added todi scripts --- examples/config_llama8b_sft.yaml | 18 ++++---- pyproject.toml | 1 + src/nanotron/trainer.py | 2 +- tools/todi/Dockerfile | 15 +++++++ tools/todi/nanotron_sft.toml | 15 +++++++ tools/todi/submit_nanotron_sft.sh | 71 +++++++++++++++++++++++++++++++ 6 files changed, 112 insertions(+), 10 deletions(-) create mode 100644 tools/todi/Dockerfile create mode 100644 tools/todi/nanotron_sft.toml create mode 100644 tools/todi/submit_nanotron_sft.sh diff --git a/examples/config_llama8b_sft.yaml b/examples/config_llama8b_sft.yaml index 010fc5e2..cf6e2db7 100644 --- a/examples/config_llama8b_sft.yaml +++ b/examples/config_llama8b_sft.yaml @@ -20,7 +20,7 @@ general: benchmark_csv_path: null consumed_train_samples: null ignore_sanity_checks: true - project: SFT + project: SFT-Todi run: Llama3-8B seed: 42 step: null @@ -33,17 +33,17 @@ model: ddp_bucket_cap_mb: 25 dtype: bfloat16 init_method: - path: /mloscratch/homes/solergib/converter/nanotron/nanotron_checkpoints/NanotronLlama38B + path: /store/swissai/a06/models/nanotron_checkpoints/Meta-Llama-3.1-8B-Instruct make_vocab_size_divisible_by: 1 model_config: - bos_token_id: 1 - eos_token_id: 2 + bos_token_id: 128000 + eos_token_id: 128001 hidden_act: silu hidden_size: 4096 initializer_range: 0.02 intermediate_size: 14336 is_llama_config: true - max_position_embeddings: 4096 + max_position_embeddings: 131072 num_hidden_layers: 32 num_attention_heads: 32 num_key_value_heads: 8 @@ -51,7 +51,7 @@ model: pretraining_tp: 1 rope_interleaved: false rope_theta: 500000.0 - rms_norm_eps: 1.0e-06 + rms_norm_eps: 1.0e-05 rope_scaling: null tie_word_embeddings: false use_cache: true @@ -76,7 +76,7 @@ optimizer: weight_decay: 0.01 zero_stage: 0 parallelism: - dp: 1 + dp: 4 expert_parallel_size: 1 pp: 1 pp_engine: 1f1b @@ -86,13 +86,13 @@ parallelism: profiler: null tokenizer: tokenizer_max_length: null - tokenizer_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct + tokenizer_name_or_path: /store/swissai/a06/models/nanotron_checkpoints/Meta-Llama-3.1-8B-Instruct tokenizer_revision: null tokens: batch_accumulation_per_replica: 1 limit_test_batches: 0 limit_val_batches: 0 - micro_batch_size: 3 + micro_batch_size: 4 sequence_length: 4096 train_steps: 250 val_check_interval: -1 diff --git a/pyproject.toml b/pyproject.toml index 6a0cfb83..4810a60a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ dependencies = [ "safetensors", "dacite", "tqdm", + "wandb", ] [tool.setuptools.packages.find] diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index c4b1d1e5..3000ae22 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -263,7 +263,7 @@ def __init__( self.micro_batch_size * self.n_micro_batches_per_batch * self.parallel_context.dp_pg.size() ) log_rank( - f"Flattening Batch dimension for SFT training. 
global_batch_size:{self.global_batch_size}, micro_batch_size: {self.micro_batch_size}, sequence_length: {self.sequence_length}", + f"Flattening Batch dimension for SFT training. global_batch_size: {self.global_batch_size}, micro_batch_size: {self.micro_batch_size}, sequence_length: {self.sequence_length}", logger=logger, level=logging.INFO, rank=0, diff --git a/tools/todi/Dockerfile b/tools/todi/Dockerfile new file mode 100644 index 00000000..611ddba0 --- /dev/null +++ b/tools/todi/Dockerfile @@ -0,0 +1,15 @@ +FROM nvcr.io/nvidia/pytorch:24.05-py3 + +# Setup +RUN apt-get update && apt-get install python3-pip python3-venv -y +RUN pip install --upgrade pip setuptools==69.5.1 + +RUN pip install flash-attn==2.5.8 --no-build-isolation + +COPY nanotron/ /workspace/nanotron +WORKDIR /workspace/nanotron +RUN pip install -e '.[nanosets]' + +# Instructions: +# 1. Build image: podman build -f /users/asolergi/SFT/nanotron/tools/todi/Dockerfile -t nanotron_sft /users/asolergi/SFT/ #### NOTE In /users/asolergi/SFT/ we have nanotron/ (/users/asolergi/SFT/nanotron) +# 2. Export image: enroot import -o /store/swissai/a06/.sft_toni/nanotron_sft.sqsh podman://localhost/nanotron_sft:latest diff --git a/tools/todi/nanotron_sft.toml b/tools/todi/nanotron_sft.toml new file mode 100644 index 00000000..ffa30484 --- /dev/null +++ b/tools/todi/nanotron_sft.toml @@ -0,0 +1,15 @@ +image = "/store/swissai/a06/.sft_toni/nanotron_sft.sqsh" +mounts = [ +"/capstor", +"/users", +"/store", +] +workdir = "/workspace/nanotron/" + +[env] +FI_CXI_DISABLE_HOST_REGISTER = "1" +FI_MR_CACHE_MONITOR = "userfaultfd" + +[annotations.com.hooks] +aws_ofi_nccl.enabled = "true" +aws_ofi_nccl.variant = "cuda12" diff --git a/tools/todi/submit_nanotron_sft.sh b/tools/todi/submit_nanotron_sft.sh new file mode 100644 index 00000000..13a6696f --- /dev/null +++ b/tools/todi/submit_nanotron_sft.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +#SBATCH --job-name nanotron_sft +#SBATCH --chdir /users/asolergi/SFT/nanotron # TODO Set this path!!! +#SBATCH --output reports/R-%x.%j.out # Make sure this paths exists, otherwise the job will fail silently +#SBATCH --error reports/R-%x.%j.err # Make sure this paths exists, otherwise the job will fail silently +#SBATCH --nodes 4 # number of Nodes +#SBATCH --ntasks-per-node 1 # number of MP tasks. IMPORTANT: torchrun represents just 1 Slurm task +#SBATCH --gres gpu:4 # Number of GPUs +#SBATCH --cpus-per-task 288 # number of CPUs per task. +#SBATCH --time 11:59:59 # maximum execution time (DD-HH:MM:SS). 
Mandatory field in MN5 +#SBATCH --reservation todi +#SBATCH --environment /store/swissai/a06/.sft_toni/nanotron_sft.toml +#SBATCH --contiguous + +echo "START TIME: $(date)" + +# auto-fail on any errors in this script +set -eo pipefail + +# logging script's variables/commands for future debug needs +set -x + +###################### +### Set environment ### +###################### +GPUS_PER_NODE=4 +echo "NODES: $SLURM_NNODES" +###################### + +###################### +#### Set network ##### +###################### +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +###################### + +# note that we don't want to interpolate `\$SLURM_PROCID` till `srun` since otherwise all nodes will get +# 0 and the launcher will hang +# +# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time +LAUNCHER="torchrun \ + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $SLURM_NNODES \ + --rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ + --rdzv_backend c10d \ + --max_restarts 0 \ + --tee 3 \ + --node_rank ${SLURM_PROCID} \ + " + +PYTHON_FILE=/workspace/nanotron/run_train.py +NANOTRON_CONFIG=/users/asolergi/SFT/nanotron/examples/config_llama8b_sft.yaml # TODO Set this path!!! + +export CMD="CUDA_DEVICE_MAX_CONNECTIONS=1 $LAUNCHER $PYTHON_FILE --config $NANOTRON_CONFIG" + +echo $CMD + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +SRUN_ARGS=" \ + --cpus-per-task $SLURM_CPUS_PER_TASK \ + --jobid $SLURM_JOB_ID \ + --wait 60 \ + --unbuffered \ + " + +# bash -c is needed for the delayed interpolation of env vars to work +srun $SRUN_ARGS bash -c "$CMD" + +echo "END TIME: $(date)" From 38f3815108665424e62387a8b1798cb6a452e706 Mon Sep 17 00:00:00 2001 From: tj-solergibert Date: Fri, 2 Aug 2024 07:19:35 +0000 Subject: [PATCH 8/9] Added SFT docs --- convert_hf_nanotron.ipynb | 1344 ------------------------------------- docs/sft.md | 56 ++ docs/sft_feature1.png | Bin 0 -> 17109 bytes docs/sft_feature2.png | Bin 0 -> 31791 bytes docs/sft_feature3.png | Bin 0 -> 27276 bytes 5 files changed, 56 insertions(+), 1344 deletions(-) delete mode 100644 convert_hf_nanotron.ipynb create mode 100644 docs/sft.md create mode 100644 docs/sft_feature1.png create mode 100644 docs/sft_feature2.png create mode 100644 docs/sft_feature3.png diff --git a/convert_hf_nanotron.ipynb b/convert_hf_nanotron.ipynb deleted file mode 100644 index 34605e00..00000000 --- a/convert_hf_nanotron.ipynb +++ /dev/null @@ -1,1344 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "from torch.testing import assert_close\n", - "\n", - "import os\n", - "\n", - "dtype = torch.bfloat16\n", - "device = torch.device(\"cuda\")\n", - "\n", - "os.environ[\"WORLD_SIZE\"] = \"1\"\n", - "os.environ[\"RANK\"] = \"0\"\n", - "os.environ[\"MASTER_ADDR\"] = \"0.0.0.0\"\n", - "os.environ[\"MASTER_PORT\"] = \"6000\"" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "PATH_TO_LLAMA = \"/mloscratch/homes/solergib/models/Meta-Llama-3-8B-Instruct\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/solergib/.local/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", - "Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 7.36it/s]\n" - ] - } - ], - "source": [ - "from transformers import AutoModelForCausalLM\n", - "hf_model = AutoModelForCausalLM.from_pretrained(PATH_TO_LLAMA, torch_dtype=dtype, attn_implementation=\"flash_attention_2\").to(device)\n", - "# print(hf_model)\n", - "# print(hf_model.config)\n", - "#print(hf_model.model.rotary_emb.ori_inv_freq.dtype)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "LlamaConfig {\n", - " \"architectures\": [\n", - " \"LlamaForCausalLM\"\n", - " ],\n", - " \"attention_bias\": false,\n", - " \"attention_dropout\": 0.0,\n", - " \"bos_token_id\": 128000,\n", - " \"eos_token_id\": 128001,\n", - " \"hidden_act\": \"silu\",\n", - " \"hidden_size\": 4096,\n", - " \"initializer_range\": 0.02,\n", - " \"intermediate_size\": 14336,\n", - " \"max_position_embeddings\": 8192,\n", - " \"mlp_bias\": false,\n", - " \"model_type\": \"llama\",\n", - " \"num_attention_heads\": 32,\n", - " \"num_hidden_layers\": 32,\n", - " \"num_key_value_heads\": 8,\n", - " \"pretraining_tp\": 1,\n", - " \"rms_norm_eps\": 1e-05,\n", - " \"rope_scaling\": null,\n", - " \"rope_theta\": 500000.0,\n", - " \"tie_word_embeddings\": false,\n", - " \"torch_dtype\": \"bfloat16\",\n", - " \"transformers_version\": \"4.44.0.dev0\",\n", - " \"use_cache\": true,\n", - " \"vocab_size\": 128256\n", - "}\n", - "\n" - ] - } - ], - "source": [ - "from transformers import LlamaConfig\n", - "hf_config = LlamaConfig.from_pretrained(PATH_TO_LLAMA)\n", - "print(hf_config)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from nanotron.config import ParallelismArgs\n", - "from nanotron.parallel import ParallelContext\n", - "from nanotron.parallel.pipeline_parallel.engine import AllForwardAllBackwardPipelineEngine\n", - "from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode\n", - "\n", - "DP = 1\n", - "PP = 1\n", - "TP = 1\n", - "\n", - "parallel_config = ParallelismArgs(\n", - " dp=DP,\n", - " pp=PP,\n", - " tp=TP,\n", - " pp_engine=AllForwardAllBackwardPipelineEngine(),\n", - " tp_mode=TensorParallelLinearMode.ALL_REDUCE,\n", - " tp_linear_async_communication=False,\n", - ")\n", - "assert (\n", - " parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE\n", - " and parallel_config.tp_linear_async_communication is False\n", - ")\n", - "\n", - "parallel_context = ParallelContext(\n", - " data_parallel_size=parallel_config.dp,\n", - " pipeline_parallel_size=parallel_config.pp,\n", - " tensor_parallel_size=parallel_config.tp,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from nanotron.config.models_config import LlamaConfig as LlamaConfigNanotron\n", - "\n", - "nanotron_config = LlamaConfigNanotron(\n", - " bos_token_id=hf_config.bos_token_id,\n", - " eos_token_id=hf_config.eos_token_id,\n", - " hidden_act=hf_config.hidden_act,\n", - " hidden_size=hf_config.hidden_size,\n", - " initializer_range=hf_config.initializer_range,\n", - " intermediate_size=hf_config.intermediate_size,\n", - " 
is_llama_config=True,\n", - " max_position_embeddings=hf_config.max_position_embeddings,\n", - " num_attention_heads=hf_config.num_attention_heads,\n", - " num_hidden_layers=hf_config.num_hidden_layers,\n", - " num_key_value_heads=hf_config.num_key_value_heads,\n", - " pad_token_id=None,\n", - " pretraining_tp=hf_config.pretraining_tp,\n", - " rms_norm_eps=hf_config.rms_norm_eps,\n", - " rope_scaling=hf_config.rope_scaling,\n", - " rope_theta=hf_config.rope_theta,\n", - " rope_interleaved=False,\n", - " tie_word_embeddings=hf_config.tie_word_embeddings,\n", - " use_cache=hf_config.use_cache,\n", - " vocab_size=hf_config.vocab_size,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.float32\n" - ] - } - ], - "source": [ - "from nanotron.models.llama_sft import LlamaForSFT\n", - "from nanotron.models import build_model\n", - "\n", - "nanotron_model = build_model(\n", - " model_builder=lambda: LlamaForSFT(\n", - " config=nanotron_config,\n", - " parallel_context=parallel_context,\n", - " parallel_config=parallel_config,\n", - " random_states=None,\n", - " ),\n", - " parallel_context=parallel_context,\n", - " dtype=dtype,\n", - " device=device,\n", - ")\n", - "# print(nanotron_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from nanotron.trainer import mark_tied_parameters\n", - "\n", - "mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ShardedInfo(global_ranks=(0,), local_global_slices_pairs=(SlicesPair(local_slices=(slice(None, None, None), slice(None, None, None)), global_slices=(slice(0, 128256, None), slice(None, None, None))),), unsharded_shape=(128256, 4096))" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.get_sharded_info()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.is_tied" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Final script\n", - "# TODO Añadir variables de TP para splitear los parametros de las layers de HF\n", - "# TODO Cargar modelo HF en cpu y copiar desde ahi\n", - "\n", - "\n", - "# Token embeddings\n", - "assert nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.shape == hf_model.model.embed_tokens.weight.shape\n", - "\n", - "with torch.no_grad():\n", - " nanotron_model.model.token_position_embeddings.pp_block.token_embedding.weight.copy_(hf_model.model.embed_tokens.weight)# = hf_model.model.embed_tokens.weight.data\n", - "\n", - "# Decoder layers\n", - "for i in range(nanotron_config.num_hidden_layers):\n", - " # Input layer norm\n", - " assert hf_model.model.layers[i].input_layernorm.weight.shape == nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.shape\n", - " with torch.no_grad():\n", - " 
nanotron_model.model.decoder[i].pp_block.input_layernorm.weight.copy_(hf_model.model.layers[i].input_layernorm.weight)# = hf_model.model.layers[i].input_layernorm.weight\n", - " # Self attn\n", - " ## QKV\n", - " tmp_qkv_proj = torch.cat([\n", - " hf_model.model.layers[i].self_attn.q_proj.weight,\n", - " hf_model.model.layers[i].self_attn.k_proj.weight,\n", - " hf_model.model.layers[i].self_attn.v_proj.weight\n", - " ], dim = 0) \n", - " assert tmp_qkv_proj.shape == nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.shape\n", - " with torch.no_grad():\n", - " nanotron_model.model.decoder[i].pp_block.attn.qkv_proj.weight.copy_(tmp_qkv_proj)# = tmp_qkv_proj # torch.nn.Parameter(tmp_qkv_proj)\n", - " \n", - " ## O\n", - " assert hf_model.model.layers[i].self_attn.o_proj.weight.shape == nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.shape\n", - " with torch.no_grad():\n", - " nanotron_model.model.decoder[i].pp_block.attn.o_proj.weight.copy_(hf_model.model.layers[i].self_attn.o_proj.weight)# = hf_model.model.layers[i].self_attn.o_proj.weight\n", - " # MLP\n", - " ## Gate Up Proj\n", - " tmp_gate_up_proj = torch.cat([\n", - " hf_model.model.layers[i].mlp.gate_proj.weight,\n", - " hf_model.model.layers[i].mlp.up_proj.weight,\n", - " ], dim = 0)\n", - "\n", - " assert tmp_gate_up_proj.shape == nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.shape\n", - " with torch.no_grad():\n", - " nanotron_model.model.decoder[i].pp_block.mlp.gate_up_proj.weight.copy_(tmp_gate_up_proj)# = tmp_gate_up_proj\n", - " ## Down Proj\n", - " assert hf_model.model.layers[i].mlp.down_proj.weight.shape == nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.shape\n", - " with torch.no_grad():\n", - " nanotron_model.model.decoder[i].pp_block.mlp.down_proj.weight.copy_(hf_model.model.layers[i].mlp.down_proj.weight)# = hf_model.model.layers[i].mlp.down_proj.weight\n", - "\n", - "\n", - " # Post attn layer norm\n", - " assert hf_model.model.layers[i].post_attention_layernorm.weight.shape == nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.shape\n", - " with torch.no_grad():\n", - " nanotron_model.model.decoder[i].pp_block.post_attention_layernorm.weight.copy_(hf_model.model.layers[i].post_attention_layernorm.weight)# = hf_model.model.layers[i].post_attention_layernorm.weight\n", - " \n", - "# Last layer norm\n", - "assert nanotron_model.model.final_layer_norm.pp_block.weight.shape == hf_model.model.norm.weight.shape\n", - "with torch.no_grad():\n", - " nanotron_model.model.final_layer_norm.pp_block.weight.copy_(hf_model.model.norm.weight)# = hf_model.model.norm.weight\n", - "# LM_Head\n", - "assert nanotron_model.model.lm_head.pp_block.weight.shape == hf_model.lm_head.weight.shape\n", - "with torch.no_grad():\n", - " nanotron_model.model.lm_head.pp_block.weight.copy_(hf_model.lm_head.weight)# = hf_model.lm_head.weight" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading readme: 100%|██████████| 2.15k/2.15k [00:00<00:00, 13.8MB/s]\n" - ] - } - ], - "source": [ - "\"\"\"\n", - "import importlib\n", - "import nanotron\n", - "importlib.reload(nanotron.data.chat_dataset)\n", - "importlib.reload(nanotron.data.collator)\n", - "\"\"\"\n", - "\n", - "from nanotron.data.chat_dataset import ChatDataset\n", - "from nanotron.data.dataloader_builder import build_chat_dataloader\n", - "\n", - "train_dataset = ChatDataset(\n", - " 
dataset_path=\"Open-Orca/SlimOrca\",\n", - " tokenizer_name_or_path=PATH_TO_LLAMA,\n", - " sequence_length=2048,\n", - " train_on_completions_only=True,\n", - " remove_cross_attention=True,\n", - " split=\"train\",\n", - " conversation_column_name=\"conversations\",\n", - " dp_rank=parallel_context.dp_pg.rank(),\n", - " dp_ranks_size=parallel_context.dp_pg.size(),\n", - ")\n", - "\n", - "# Prepare dataloader\n", - "train_dataloader = build_chat_dataloader(\n", - " dataset=train_dataset,\n", - " sequence_length=2048,\n", - " parallel_context=parallel_context,\n", - " input_pp_rank=0,\n", - " output_pp_rank=0,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input_ids': tensor([[128000, 128006, 26380, ..., 16686, 13, 128009]],\n", - " dtype=torch.int32),\n", - " 'position_ids': tensor([[ 0, 1, 2, ..., 576, 577, 578]], dtype=torch.int32),\n", - " 'label_ids': tensor([[128006, 26380, 128007, ..., 13, 128009, 128001]],\n", - " dtype=torch.int32),\n", - " 'label_mask': tensor([[False, False, False, ..., True, True, True]])}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "batch = next(iter(train_dataloader))\n", - "batch" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'input_ids': tensor([[128000, 128006, 26380, ..., 16686, 13, 128009]],\n", - " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 576, 577, 578]], dtype=torch.int32), 'label_ids': tensor([[128006, 26380, 128007, ..., 13, 128009, 128001]],\n", - " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n", - "{'input_ids': tensor([[128000, 128006, 9125, ..., 27065, 13, 128009]],\n", - " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 517, 518, 519]], dtype=torch.int32), 'label_ids': tensor([[128006, 9125, 128007, ..., 13, 128009, 128001]],\n", - " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n", - "{'input_ids': tensor([[128000, 128006, 9125, ..., 62491, 13, 128009]],\n", - " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 641, 642, 643]], dtype=torch.int32), 'label_ids': tensor([[128006, 9125, 128007, ..., 13, 128009, 128001]],\n", - " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n", - "{'input_ids': tensor([[128000, 128006, 9125, ..., 15507, 13, 128009]],\n", - " dtype=torch.int32), 'position_ids': tensor([[ 0, 1, 2, ..., 86, 87, 88]], dtype=torch.int32), 'label_ids': tensor([[128006, 9125, 128007, ..., 13, 128009, 128001]],\n", - " dtype=torch.int32), 'label_mask': tensor([[False, False, False, ..., True, True, True]])}\n" - ] - } - ], - "source": [ - "for i, batch in enumerate(train_dataloader):\n", - " print(batch)\n", - " if i == 3:\n", - " break" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "assert batch[\"input_ids\"].shape == batch[\"label_ids\"].shape \n", - "assert batch[\"input_ids\"].shape == batch[\"position_ids\"].shape\n", - "assert batch[\"input_ids\"].shape == batch[\"label_mask\"].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LlamaForSFT(\n", - " (model): LlamaModel(\n", - " (token_position_embeddings): PipelineBlock(\n", - " 
pp_rank=0\n", - " (pp_block): Embedding(\n", - " (token_embedding): TensorParallelEmbedding(tp_rank=0, 128256, 4096, unsharded_num_embeddings=128256)\n", - " (position_embedding): LlamaRotaryEmbedding()\n", - " )\n", - " )\n", - " (decoder): ModuleList(\n", - " (0-31): 32 x PipelineBlock(\n", - " pp_rank=0\n", - " (pp_block): LlamaDecoderLayer(\n", - " (input_layernorm): TritonRMSNorm()\n", - " (attn): CausalSelfAttention(\n", - " (qkv_proj): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=6144, bias=False, unsharded_out_features=6144)\n", - " (o_proj): TensorParallelRowLinear(tp_rank=0, in_features=4096, out_features=4096, bias=False, unsharded_in_features=4096)\n", - " )\n", - " (post_attention_layernorm): TritonRMSNorm()\n", - " (mlp): MLP(\n", - " (gate_up_proj): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=28672, bias=False, unsharded_out_features=28672)\n", - " (down_proj): TensorParallelRowLinear(tp_rank=0, in_features=14336, out_features=4096, bias=False, unsharded_in_features=14336)\n", - " (split_silu_mul): GLUActivation(\n", - " (act): SiLUActivation()\n", - " )\n", - " )\n", - " )\n", - " )\n", - " )\n", - " (final_layer_norm): PipelineBlock(\n", - " pp_rank=0\n", - " (pp_block): TritonRMSNorm()\n", - " )\n", - " (lm_head): PipelineBlock(\n", - " pp_rank=0\n", - " (pp_block): TensorParallelColumnLinear(tp_rank=0, in_features=4096, out_features=128256, bias=False, unsharded_out_features=128256)\n", - " )\n", - " (cast_to_fp32): PipelineBlock(pp_rank=0)\n", - " )\n", - " (loss): PipelineBlock(\n", - " pp_rank=0\n", - " (pp_block): Loss()\n", - " )\n", - ")" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# TODO(tj.solergibert) Comparar LlamaModel vs LlamaModel, nada de causal ni SFT!\n", - "# TODO(tj.solergibert) Vale, ya lo estabamos haciendo.\n", - "# TODO(tj.solergibert) Quedaria revisar lo de la LOSS, mierda. 
Tendremos que hacer una reduccion y usar la de pytorch\n", - "# TODO(tj.solergibert) Para asegurarnos que todo bien Y LUEGO YA SI ESO LO DE LA MASK.\n", - "hf_model.eval()\n", - "nanotron_model.eval()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1 a 1" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "input_ids = batch[\"input_ids\"].cuda()\n", - "position_ids = batch[\"position_ids\"].cuda()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "n_embedd = nanotron_model.model.token_position_embeddings(input_ids=input_ids, position_ids=position_ids)\n", - "n_embedd[\"hidden_states\"] = n_embedd.pop(\"input_embeds\")" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "hf_embedd = hf_model.model.embed_tokens(input_ids)\n", - "hf_position_embeddings = hf_model.model.rotary_emb(hf_embedd, position_ids)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "assert_close(n_embedd[\"hidden_states\"].transpose(0,1), hf_embedd) # TODO(tj.solergibert) Embeddings now are equal!\n", - "assert_close(n_embedd[\"cos\"], hf_position_embeddings[0])\n", - "assert_close(n_embedd[\"sin\"], hf_position_embeddings[1])" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n" - ] - } - ], - "source": [ - "n_hidden_encoder_states = nanotron_model.model.decoder[0](**n_embedd)" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'hidden_states': tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005]],\n", - " \n", - " [[ 0.0065, 0.0144, 0.0079, ..., -0.0157, -0.0422, -0.0073]],\n", - " \n", - " [[-0.0117, -0.0225, 0.0166, ..., -0.0114, -0.0019, 0.0105]],\n", - " \n", - " ...,\n", - " \n", - " [[ 0.0205, 0.0003, -0.0043, ..., -0.0337, 0.0027, -0.0114]],\n", - " \n", - " [[ 0.0017, -0.0008, 0.0084, ..., 0.0054, 0.0016, 0.0060]],\n", - " \n", - " [[-0.0025, -0.0031, -0.0141, ..., -0.0088, 0.0073, 0.0090]]],\n", - " device='cuda:0', dtype=torch.bfloat16, grad_fn=),\n", - " 'position_ids': tensor([[ 0, 1, 2, ..., 576, 577, 578]], device='cuda:0',\n", - " dtype=torch.int32),\n", - " 'cos': tensor([[[ 1.0000, 1.0000, 1.0000, ..., 1.0000, 1.0000, 1.0000],\n", - " [ 0.5391, 0.6875, 0.7891, ..., 1.0000, 1.0000, 1.0000],\n", - " [-0.4160, -0.0583, 0.2412, ..., 1.0000, 1.0000, 1.0000],\n", - " ...,\n", - " [-0.4629, -0.4336, 0.5078, ..., 1.0000, 1.0000, 1.0000],\n", - " [ 0.4941, 0.3574, 0.9297, ..., 1.0000, 1.0000, 1.0000],\n", - " [ 1.0000, 0.9258, 0.9609, ..., 1.0000, 1.0000, 1.0000]]],\n", - " device='cuda:0', dtype=torch.bfloat16),\n", - " 'sin': tensor([[[ 0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00,\n", - " 0.0000e+00, 0.0000e+00],\n", - " [ 8.3984e-01, 7.2656e-01, 6.1719e-01, ..., 3.6955e-06,\n", - " 3.0100e-06, 2.4587e-06],\n", - " [ 9.1016e-01, 1.0000e+00, 9.6875e-01, ..., 7.3910e-06,\n", - " 6.0201e-06, 4.9174e-06],\n", - " ...,\n", - " [-8.8672e-01, -9.0234e-01, -8.6328e-01, ..., 2.1362e-03,\n", - " 1.7395e-03, 1.4114e-03],\n", - " [-8.6719e-01, -9.3359e-01, -3.6719e-01, ..., 2.1362e-03,\n", - " 
1.7395e-03, 1.4191e-03],\n", - " [-5.2979e-02, -3.8086e-01, 2.8320e-01, ..., 2.1362e-03,\n", - " 1.7395e-03, 1.4191e-03]]], device='cuda:0', dtype=torch.bfloat16)}" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "n_hidden_encoder_states" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n" - ] - } - ], - "source": [ - "hf_hidden = hf_model.model.layers[0](hf_embedd, position_ids=position_ids, position_embeddings=hf_position_embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005],\n", - " [ 0.0064, 0.0146, 0.0078, ..., -0.0157, -0.0425, -0.0073],\n", - " [-0.0117, -0.0225, 0.0167, ..., -0.0115, -0.0018, 0.0106],\n", - " ...,\n", - " [ 0.0205, 0.0004, -0.0043, ..., -0.0334, 0.0027, -0.0114],\n", - " [ 0.0017, -0.0008, 0.0084, ..., 0.0054, 0.0016, 0.0061],\n", - " [-0.0025, -0.0032, -0.0141, ..., -0.0087, 0.0073, 0.0090]]],\n", - " device='cuda:0', dtype=torch.bfloat16, grad_fn=),)" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hf_hidden" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "ename": "AssertionError", - "evalue": "Tensor-likes are not close!\n\nMismatched elements: 1151415 / 7770112 (14.8%)\nGreatest absolute difference: 0.001953125 at index (0, 442, 3824) (up to 1e-05 allowed)\nGreatest relative difference: inf at index (0, 2, 2232) (up to 0.016 allowed)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[40], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43mn_hidden_encoder_states\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhidden_states\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhf_hidden\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 
1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", - "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 1151415 / 7770112 (14.8%)\nGreatest absolute difference: 0.001953125 at index (0, 442, 3824) (up to 1e-05 allowed)\nGreatest relative difference: inf at index (0, 2, 2232) (up to 0.016 allowed)" - ] - } - ], - "source": [ - "assert_close(n_hidden_encoder_states[\"hidden_states\"].transpose(0,1), hf_hidden[0])" - ] - }, - { - "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005],\n", - " [ 0.0060, 0.0125, 0.0074, ..., -0.0181, -0.0356, -0.0070],\n", - " [-0.0164, -0.0225, 0.0219, ..., -0.0098, -0.0084, 0.0156],\n", - " ...,\n", - " [ 0.0121, 0.0106, -0.0149, ..., -0.0229, -0.0056, -0.0021],\n", - " [ 0.0065, 0.0256, -0.0107, ..., -0.0027, -0.0085, 0.0192],\n", - " [ 0.0025, 0.0199, -0.0267, ..., -0.0056, -0.0045, 0.0182]]],\n", - " device='cuda:0', dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 59, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "n_hidden_encoder_states[\"hidden_states\"].transpose(0,1)" - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[[ 0.0014, 0.0040, -0.0050, ..., 0.0093, -0.0007, 0.0005],\n", - " [ 0.0064, 0.0146, 0.0078, ..., -0.0157, -0.0425, -0.0073],\n", - " [-0.0117, -0.0225, 0.0167, ..., -0.0115, -0.0018, 0.0106],\n", - " ...,\n", - " [ 0.0205, 0.0004, -0.0043, ..., -0.0334, 0.0027, -0.0114],\n", - " [ 0.0017, -0.0008, 0.0084, ..., 0.0054, 0.0016, 0.0061],\n", - " [-0.0025, -0.0032, -0.0141, ..., -0.0087, 0.0073, 0.0090]]],\n", - " device='cuda:0', dtype=torch.bfloat16, grad_fn=)" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hf_hidden[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Inference" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 
1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - 
"tensor(579, device='cuda:0', dtype=torch.int32)\n" - ] - } - ], - "source": [ - "with torch.inference_mode():\n", - " output_nanotron = nanotron_model.model(input_ids=batch[\"input_ids\"].cuda(), position_ids = batch[\"position_ids\"].cuda())" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 
1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n", - "tensor(579, device='cuda:0', dtype=torch.int32)\n", - "tensor([ 0, 164, 443, 935, 1208, 1318, 1897], device='cuda:0',\n", - " dtype=torch.int32)\n" - ] - } - ], - "source": [ - "with torch.inference_mode():\n", - " output_hf = hf_model(input_ids=batch[\"input_ids\"].cuda(), position_ids = batch[\"position_ids\"].cuda())" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "ename": "AssertionError", - "evalue": "Tensor-likes are not close!\n\nMismatched elements: 1013596 / 243301632 (0.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 0.1 allowed)\nGreatest relative difference: 537153.0 at index (0, 406, 16297) (up to 0.1 allowed)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[20], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_hf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlogits\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43moutput_nanotron\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranspose\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43matol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mrtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-1\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", - "\u001b[0;31mAssertionError\u001b[0m: Tensor-likes are not close!\n\nMismatched elements: 1013596 / 243301632 (0.4%)\nGreatest absolute difference: 3.58984375 at index (0, 373, 33435) (up to 0.1 allowed)\nGreatest relative difference: 537153.0 at index (0, 406, 16297) (up to 0.1 allowed)" - ] - } - ], - "source": [ - "assert_close(output_hf.logits, output_nanotron.transpose(0,1), atol=1e-1, rtol=1e-1)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[HF Model] Next token: 704, probability: 0.9999432563781738\n", - "[HF Model] Next token: 14, probability: 3.535549694788642e-05\n", - "[HF Model] Next token: 6917, probability: 1.67007528943941e-05\n", - "[HF Model] Next token: 1057, probability: 1.5534121757809771e-06\n", - "[HF Model] Next token: 320, probability: 1.209798483614577e-06\n", - "[HF Model] Next token: 315, probability: 9.421920026397856e-07\n", - "[HF Model] Next token: 412, probability: 1.637284157141039e-07\n", - "[HF Model] Next token: 9994, probability: 9.930631250654187e-08\n", - "[HF Model] Next token: 12, probability: 8.763750969364992e-08\n", - "[HF Model] Next token: 6033, probability: 6.825216303241177e-08\n" - ] - } - ], - "source": [ - "predicted_token = 345\n", - "\n", - "next_tokens_hf = torch.softmax(output_hf.logits[0, predicted_token, :], -1)\n", - "hf_topk_next_tokens= torch.topk(next_tokens_hf, 10)\n", - "\n", - "\n", - "print(*[f\"[HF Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(hf_topk_next_tokens.indices, hf_topk_next_tokens.values)], sep=\"\\n\")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Nanotron Model] Next token: 704, probability: 0.9999523162841797\n", - "[Nanotron Model] Next token: 14, probability: 3.120139808743261e-05\n", - "[Nanotron Model] Next token: 6917, probability: 1.3006677363591734e-05\n", - "[Nanotron Model] Next token: 1057, probability: 1.209809511237836e-06\n", - "[Nanotron Model] Next token: 320, probability: 9.422005859960336e-07\n", - "[Nanotron Model] Next token: 315, probability: 8.3148904650443e-07\n", - "[Nanotron Model] Next token: 412, probability: 1.2751297617796808e-07\n", - "[Nanotron Model] Next token: 9994, probability: 7.734053042440792e-08\n", - "[Nanotron Model] Next token: 12, probability: 6.825278120459188e-08\n", - "[Nanotron Model] Next token: 21337, 
probability: 6.023287113521292e-08\n" - ] - } - ], - "source": [ - "next_tokens_nanotron = torch.softmax(output_nanotron.transpose(0,1)[0, predicted_token, :], -1)\n", - "nanotron_topk_next_tokens= torch.topk(next_tokens_nanotron, 10)\n", - "\n", - "\n", - "print(*[f\"[Nanotron Model] Next token: {idx.item()}, probability: {prob}\" for idx, prob in zip(nanotron_topk_next_tokens.indices, nanotron_topk_next_tokens.values)], sep=\"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Comprobar loss con las masks!\n", - "HF no have lo de train on completitions only, o si? Creo que no tiene atten mask para los labels, asi que primero lo hacemos manual y luego a mano con su formula de crossentropy a mano con los -100!" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "tensor(0.9076, device='cuda:0')\n" - ] - } - ], - "source": [ - "# Nanotron\n", - "nanotron_loss = nanotron_model.loss(\n", - " sharded_logits=output_nanotron,\n", - " label_ids=batch[\"label_ids\"].cuda(),\n", - " label_mask=batch[\"label_mask\"].cuda(),\n", - " )[\"loss\"]\n", - "print(nanotron_loss)" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "\n", - "def build_labels_completions_only(input_ids, is_completitions):\n", - " labels = np.where(\n", - " is_completitions, input_ids, -100\n", - " ) # Mask tokens that don't belong to the completitions by the Assistant\n", - " return torch.tensor(np.array(labels, dtype=np.int64))" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "torch.Size([1897, 128256])\n", - "torch.Size([1897])\n", - "tensor(0.9081, device='cuda:0')\n" - ] - } - ], - "source": [ - "# HF\n", - "from torch.nn import CrossEntropyLoss\n", - "\n", - "hf_labels = build_labels_completions_only(batch[\"label_ids\"].flatten().tolist(), batch[\"label_mask\"].flatten().tolist())\n", - "\n", - "shift_logits = output_hf.logits.contiguous()\n", - "shift_labels = hf_labels.contiguous()\n", - "loss_fct = CrossEntropyLoss()\n", - "\n", - "shift_logits = shift_logits.view(-1, 128256)\n", - "shift_labels = shift_labels.view(-1)\n", - "# Enable model parallelism\n", - "shift_labels = shift_labels.to(\"cuda\")\n", - "hf_loss = loss_fct(shift_logits, shift_labels)\n", - "print(hf_loss)" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "ename": "AssertionError", - "evalue": "Scalars are not close!\n\nExpected 0.9080765247344971 but got 0.9075685739517212.\nAbsolute difference: 0.0005079507827758789 (up to 0.0001 allowed)\nRelative difference: 0.0005593700188697129 (up to 0.0001 allowed)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[58], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43massert_close\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnanotron_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhf_loss\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43matol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-4\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mrtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1e-4\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/testing/_comparison.py:1520\u001b[0m, in \u001b[0;36massert_close\u001b[0;34m(actual, expected, allow_subclasses, rtol, atol, equal_nan, check_device, check_dtype, check_layout, check_stride, msg)\u001b[0m\n\u001b[1;32m 1498\u001b[0m error_metas \u001b[38;5;241m=\u001b[39m not_close_error_metas(\n\u001b[1;32m 1499\u001b[0m actual,\n\u001b[1;32m 1500\u001b[0m expected,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1515\u001b[0m msg\u001b[38;5;241m=\u001b[39mmsg,\n\u001b[1;32m 1516\u001b[0m )\n\u001b[1;32m 1518\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error_metas:\n\u001b[1;32m 1519\u001b[0m \u001b[38;5;66;03m# TODO: compose all metas into one AssertionError\u001b[39;00m\n\u001b[0;32m-> 1520\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_metas[\u001b[38;5;241m0\u001b[39m]\u001b[38;5;241m.\u001b[39mto_error(msg)\n", - "\u001b[0;31mAssertionError\u001b[0m: Scalars are not close!\n\nExpected 0.9080765247344971 but got 0.9075685739517212.\nAbsolute difference: 0.0005079507827758789 (up to 0.0001 allowed)\nRelative difference: 0.0005593700188697129 (up to 0.0001 allowed)" - ] - } - ], - "source": [ - "assert_close(nanotron_loss, hf_loss, atol=1e-4, rtol=1e-4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save the Nanotron model" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [], - "source": [ - "from nanotron.parallel.parameters import sanity_check\n", - "\n", - "sanity_check(root_module=nanotron_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Saving weights: 100%|██████████| 195/195 [00:41<00:00, 4.67it/s]\n" - ] - } - ], - "source": [ - "from pathlib import Path\n", - "from nanotron.serialize import save_meta, save_weights, TrainingMetadata\n", - "from nanotron.serialize.metadata import DataStageMetadata\n", - "\n", - "out_path = \"/mloscratch/homes/solergib/converter/nanotron/n_c/first/\"\n", - "out_path = Path(out_path)\n", - "\n", - "save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=out_path)\n", - "\n", - "training_metadata = TrainingMetadata(last_train_step=0, consumed_train_samples=0, data_stages=[DataStageMetadata(name=\"Empty\", consumed_train_samples=0, start_training_step=0)])\n", - "\n", - "save_meta(root_folder=out_path, parallel_context=parallel_context, training_metadata=training_metadata)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Saving config ...\n", - "Saving model config ...\n" - ] - } - ], - "source": [ - "import json \n", - "import yaml\n", - "from nanotron.config import GeneralArgs, ModelArgs, TokenizerArgs, Config\n", - "from nanotron.config.models_config import ExistingCheckpointInit\n", - "from dataclasses import asdict\n", - "\n", - "with open(out_path / \"config.yaml\", \"w\") as f:\n", - " config = Config(\n", - " general=GeneralArgs(project=\"conversion\", run=\"Llama3-8B\"),\n", - " parallelism=parallel_config,\n", - " model=ModelArgs(\n", - " init_method=ExistingCheckpointInit(out_path),\n", - " model_config=nanotron_config,\n", - " ),\n", - " tokenizer=TokenizerArgs(PATH_TO_LLAMA),\n", - " )\n", - " print(\"Saving config 
...\")\n", - " yaml.dump(config.as_dict(), f)\n", - "\n", - "with open(out_path / \"model_config.json\", \"w\") as f:\n", - " print(\"Saving model config ...\")\n", - " json.dump(asdict(nanotron_config), f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/mloscratch/homes/solergib/SFT/transformers/src/transformers/deepspeed.py:24: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'input_ids': tensor([[27, 22, 0, 97, 13, 49, 56, 35, 70, 91, 38, 30, 26, 94, 68, 46, 89, 32,\n", - " 70, 85, 50, 67, 70, 86, 66, 82, 18, 72, 27, 37, 91, 27, 60, 57, 23, 93,\n", - " 10, 80, 82, 26, 13, 50, 12, 68, 63, 85, 55, 1, 3, 61, 37, 70, 12, 97,\n", - " 1, 59, 90, 45, 74, 62, 66, 54, 94, 18, 54, 89, 49, 3, 66, 55]],\n", - " device='cuda:0'), 'position_ids': tensor([[0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0, 1, 2,\n", - " 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5,\n", - " 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6]],\n", - " device='cuda:0')}\n" - ] - } - ], - "source": [ - "import sys\n", - "sys.path.append(\"/mloscratch/homes/solergib/SFT/transformers\")\n", - "\n", - "import torch\n", - "from t_tests.models.llama.test_modeling_llama import LlamaModelTester\n", - "\n", - "lmt = LlamaModelTester(parent=None)\n", - "\n", - "_, inputs_dict = lmt.prepare_config_and_inputs_for_common()\n", - "dummy_attention_mask = inputs_dict[\"attention_mask\"]\n", - "inputs_dict[\"input_ids\"][~dummy_attention_mask.bool()] = 0\n", - "\n", - "padfree_inputs_dict = {\n", - " k: v[dummy_attention_mask.bool()].unsqueeze(0)\n", - " for k, v in inputs_dict.items()\n", - " if not k == \"attention_mask\"\n", - "}\n", - "\n", - "padfree_inputs_dict[\"position_ids\"] = (\n", - " torch.cat([torch.arange(length) for length in dummy_attention_mask.sum(1).tolist()])\n", - " .long()\n", - " .unsqueeze(0)\n", - " .to(\"cuda\")\n", - ")\n", - "\n", - "print(padfree_inputs_dict)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs/sft.md b/docs/sft.md new file mode 100644 index 00000000..e3d104c6 --- /dev/null +++ b/docs/sft.md @@ -0,0 +1,56 @@ +# LlamaSFT +## Introduction +We have incorporated the ability to perform SFT in nanotron with the following features: +1. Packing multiple samples to fill the sequence length of the model +2. Training on completions only: The model learns from the answers, not from the user prompt & chat templates +3. Removing cross-attention between the multiple samples packed + +In the following sections, we will delve into more detail about these features and how we have implemented them. 
+
+### Feature 1: Packing
+To train the models efficiently, we pack multiple conversations into the same sample until the sequence length is filled. Because we pack multiple sequences and want to avoid introducing padding tokens, [we flatten the batch size](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/trainer.py#L259), so `sequence_length = micro_batch_size * sequence_length` and `micro_batch_size = 1`.
+![](sft_feature1.png)
+
+### Feature 2: Training only on completions
+Conversations consist of user messages, which are usually questions or inquiries, and the model's responses. The ultimate goal is for the model to improve the quality of its responses, not so much to learn the user questions or other aspects such as the chat template. Therefore, during training we compute the loss only on the tokens that belong to the answers produced by the model.
+
+To achieve this, when tokenizing the conversations we [store the role of each token](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/data/chat_tokenizer.py#L59) and create an attention mask that the model uses in the loss computation [[1]](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/models/llama_sft.py#L617), [[2]](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/models/llama_sft.py#L603).
+![](sft_feature2.png)
+
+### Feature 3: Removing cross-attention
+Finally, as we are packing multiple conversations together, we do not want the tokens of one conversation to attend to those of the other conversations.
+To do this, we store the `position_ids` of each token of the packed sequence in order to:
+1. Apply the RoPE embeddings correctly to each conversation
+2. [Create the attention mask](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/models/llama_sft.py#L346) needed by [`flash_attn_varlen_func`](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/models/llama_sft.py#L352) to compute the attention without cross-contamination between different conversations
+![](sft_feature3.png)
+
+## Internals
+### Config file
+For SFT, we need to set up the config file as follows:
+```yaml
+- data:
+    dataset:
+      hf_dataset: Magpie-Align/Magpie-Pro-300K-Filtered
+      hf_dataset_split: train
+      conversation_column_name: conversations
+      train_on_completions_only: true
+      remove_cross_attention: true
+    num_loading_workers: 1
+    seed: 42
+  name: General purpose training (Single dataset)
+  start_training_step: 1
+```
+The `hf_dataset` should be a dataset from the HuggingFace Hub with the same structure as `Magpie-Align/Magpie-Pro-300K-Filtered`; that is, each conversation is a list of dictionaries, each with the keys `from` (either `gpt` or `human`) and `value`. We can select a split with `hf_dataset_split` and the dataset column with `conversation_column_name`. `train_on_completions_only` & `remove_cross_attention` toggle Features 2 and 3 on and off; we will remove them for the final release.
+
+### Iterable Dataset
+For SFT training, we have developed a new dataset, [`ChatDataset`](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/data/chat_dataset.py#L17), responsible for producing data batches during training (a simplified sketch of the packing logic follows).
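+To make the packing idea concrete, here is a minimal, self-contained sketch; it is not the actual `ChatDataset` implementation (the class name `PackedSFTDataset` and its inputs are hypothetical), and it assumes conversations arrive already tokenized as a pair of token ids and a boolean completion mask (as produced by the `ChatTokenizer` described below). It packs conversations into samples of `sequence_length` tokens, restarts `position_ids` at 0 for every conversation (Feature 3), and keeps a `label_mask` that covers only completion tokens (Feature 2):
+
+```python
+from typing import Iterable, Iterator, List, Tuple
+
+import torch
+from torch.utils.data import IterableDataset
+
+
+class PackedSFTDataset(IterableDataset):
+    """Hypothetical sketch of a packing SFT dataset (not the nanotron implementation)."""
+
+    def __init__(self, conversations: Iterable[Tuple[List[int], List[bool]]], sequence_length: int):
+        self.conversations = conversations  # iterator of (token_ids, is_completion) pairs
+        self.sequence_length = sequence_length
+
+    def __iter__(self) -> Iterator[dict]:
+        input_ids: List[int] = []
+        position_ids: List[int] = []
+        is_completion: List[bool] = []
+
+        for tokens, completion_mask in self.conversations:
+            # Positions restart at 0 for every conversation, so RoPE and the
+            # attention metadata can be rebuilt per conversation (Feature 3).
+            input_ids.extend(tokens)
+            position_ids.extend(range(len(tokens)))
+            is_completion.extend(completion_mask)
+
+            # Emit a sample whenever the buffer holds sequence_length (+1 for shifted labels) tokens.
+            while len(input_ids) >= self.sequence_length + 1:
+                yield {
+                    "input_ids": torch.tensor(input_ids[: self.sequence_length], dtype=torch.long),
+                    "position_ids": torch.tensor(position_ids[: self.sequence_length], dtype=torch.int32),
+                    "label_ids": torch.tensor(input_ids[1 : self.sequence_length + 1], dtype=torch.long),
+                    # The loss is computed only where the mask is True, i.e. on completion tokens (Feature 2).
+                    "label_mask": torch.tensor(is_completion[1 : self.sequence_length + 1], dtype=torch.bool),
+                }
+                input_ids = input_ids[self.sequence_length :]
+                position_ids = position_ids[self.sequence_length :]
+                is_completion = is_completion[self.sequence_length :]
+```
+
+From such `position_ids`, the cumulative sequence lengths needed by `flash_attn_varlen_func` can be recovered on the fly, e.g. by taking the indices where the position counter drops back to 0.
+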
Unlike `Nanosets`, this new `ChatDataset` is an [`IterableDataset`](https://pytorch.org/docs/stable/data.html#iterable-style-datasets). The advantage of this type of dataset is that they do not require preprocessing the data before training as they do it on-the-fly, saving us the preprocessing step and the space occupied by the preprocessed data. The downside is that it is not trivial to recover the state of the DataLoader when restarting training. For this, we are developing a solution based on `torchdata`'s [`StatefulDataLoader`](https://github.com/pytorch/data/tree/main/torchdata/stateful_dataloader) that we will incorporate soon. + +For now, we allow splitting the dataset between the different data parallel ranks and plan to support interleaved datasets. + +### ChatTokenizer +To apply the chat template, tokenize the conversations, and store the role of each token, we have developed the [`ChatTokenizer`](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/data/chat_tokenizer.py#L6). Based on the one included in [`meta/llama3`](https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py), [this tokenizer will return](https://github.com/swiss-ai/nanotron/blob/c026422e5bf0bc1086c039e65d8f7bbe75dc9728/src/nanotron/data/chat_dataset.py#L92) the `tokens` of the conversation and the list of bools `is_completions` indicating whether the token belongs to the model's responses or not, necessary for Feature 2. + +For now, we only support the Llama3 tokenizer along with the official chat template of this model. + +### Recover DataLoader +Pending development diff --git a/docs/sft_feature1.png b/docs/sft_feature1.png new file mode 100644 index 0000000000000000000000000000000000000000..162322f05e175dbf4417425ae648648cf9ed4dd6 GIT binary patch literal 17109 zcmeHucUaTewkRHEU>pUUqky71 z|9;fTN2>2`{u-wyb>>jRhBjFH=sHnY-agU%ym5P^{`sSXeJDTW<7cdnn5>Rn*>yp} zcvwRI$cp5bLL`o~ze{*V`~98+lnTlmXNnzKL#7xWfOQE!uq*0n)xpXC7yk+x&e7*! zDksw+{n9W|ibJ2Tp?7~2tlhk|1tTWwTIq}p)x5$6YJr8aUD0!aPL(sQRXYX^1M>xM z#thLLVV+D4F<`2PJ3Su_&6|DVthivA{;1X;L5SsB)A6iTD8KuamWZtY=)mm7w!$y}{c2p@W#wATmef1sd>89P$%GJ%fl-@Pwcgh2PUK}gbVv;Iqi=gC z^z^774f5HgE2QV-=PUOyIDKy~!sg5(er1o4YVS=oJcFIcenVM+uQwF+9p2lI_ zBR!?qQB@Hc4JfAV$r9w~)k6qosgqz(lwqL5W{^-e6p)PD>}EE=nop^sY^Ti)@iJofQavPMO{imry$nu z)#)ZCNhQwj+Ftx9Geq8E^<6is4uxY`F30XyACiJHDvquWb30kl(psfXD0FxGT<*b~ z^EvID=zwJHCR+ZCdTKUN6X|w%9a-3ZujOccshSr4Vh}gOg!}wyr5+<-qFuqlJi@5% zI?P80wZa;n>7`?Znt>FJmVSPby-85+N?GT2IKa6>823En$DSWeIBRBZo9M5OdupFA zS&EkggbqGWNBM>Lo(|&XHJ%u1nvo-7)QMFQ1ak3!+nV{_?C{0EE88c!J`?s7ua^+! zs=MALDM+TSFKu)zee;HtY15A9D!`O&5*yG9R%%kl40;Cq-CDJe_hytIIYTxx;-0M^ z3P_!eoP1`&jmr{ToE!BDO3oeX^0mJ{>JOxdvfC%D9i7wJ5*JZNTSNLe!Hd% zmNg`ZGuO*gkJ0qi%5y0o@fKZA{e&!{63w?7dgjT0w|ZUog>%KB;rbgKWSq%-)nrATNyAp5q5P z$pJd|Y}0wAhcCdr$xRL++pWGhB5zc)kpHy8Qfaefww|k1ac|@`O!lUq2rCPYrEOG? 
z*7KTKVV#!9RW_m@8uniu>lV^wR_`(4gJS;hLFam0Rc$)#v`q5I%oPSPZ6D0i)F?|G zza$+;hHQyrDF-fTrn~9Q-@S)#id46VOK*`j-s%GL5>s!aW(SA?DOu1 zq5P_EoI#k##j6v^%4x3FiE6g@s!mr(oA2AO=JxuZ(`^$mN^uuDgoeeankr+&7{`>i^<~al=wvlZ8~fT!>WQqiRMteiRChH^!rSbW(no66x?&GJ zTw$%{5-86<&{pxo7km?_hb0|#z zP%l@lp1>&u2q(B{ZkbK_DTH=WLbf24;WIjO9XqrnE#|COOs)$o(FYah_ET3VBC~RH zoZl+b*iA_I9Pq^c&iB~3k^#}W- z644EVpK?fp79@vs+pNAkLVX*0+r{d%3~pc~Nnd202qpWXv9RD>VR-Cr$UTUZfiHQ& z+(kjMT+n@V>%ztQpLcolACa7EF*s8=l;Q4K6H@1l(;gL^oLS+#x*+4+JS0>Pko&*q z58G-slM!~F-j5zt)I1k3p1s|y-M&1ed3#6S;$n0_s99&;#!M)(At85d^4p77XwSQ$ zPNe~#dKI-@c*OyfF4&2jN^L+|*4FWSSyQau<_`$3>({4i1NMY-H!m1{JSc}~yr_*> zYH{Hwk6H?@PYKnKHSC%)gddE$M4%PTiN@O#yr24{2luzeKVG}UM-^2?d>-pzx$*{j zs8C*k=R#fA5xW=XYUuPIS1Bw7&hU1(IaJ>mW_4b?A(}2(3C{NSu%=&Njkog00L|Tw z*DGe`l$ixh&H;mbs4vAHiZA!e(s1|Y(Q6bdTsUoV70L^fO|E!$kfGpm$H?~H>m{@v z*VFOs+hD6on|1?)q>+y#a??b+sB0#eGWtdo(&}s1xWWVRVVC=!^aYA*xeI*Sw@wm_bk3DP;z495 zuXm!px`G`t z(zy(QQpsx)Y-ZjDG(~mQDuO5%F`jl9XHHW2@fYtDWdOS97NLTc1Dr;N&Nol7B#ljVg=sV9`(t#U?oR@J>S9nRx8h)GEA zNi|{-fDMhjvfi~Ktd*-BAvDyG4oo;>ryWW$)YWn=8R>OzwNKPO3h=D%OAVaLh-~ z>+cs;k%x1TdUVr`tSCLx)IwzuX=3oD8=G!9KJ^QLkbHS#_zm`Kv5P;Y7a<@MSEQ-7 z+Flh3I(xwB81t47&dfkyEFvLw6s@|9rFq5r7k;02nEF#%hCa9Tq*hGfo?y%Xp84K| zmPeTCr70-anfA&&VZ62sDQsFOjGax>WPq&xAs`)LqhE2_#GgcTU@RSlRUO3JIsKK75ph5Qu6qUDRc}A9`il(M7a(&9r7P+vR;OSxnvPG_VidJFI z3oN4!kIVMU+F=@Xt!}e-u(LzPY$C=E5c&Rrq#R^k7{8eROhfuw!v2DL2p-G9aHfm!Dq>q3vbob4N#$*uy0 zkiHyvcKsUk5rR-h#i;;Fz_s_rd@84=%X-h$Oh7p8kHqec?l;z;%qBL>iL!_*?9=6? zj6JM;k?*Ay>6RsPEcOKh!ky=GI5HKt-w_AE3tEigLq1L{(RQ{MZ@S-Yf8lVBlsu;i ze&gxVIzGfzC9g}wd9PS5YcQ{@cC|$jd=aQWSsWu2r1WB+`5iuc_4bMfN@oUDSg&vM zVB$NcSZE`=>Kyo+Lni#oXkz}?PM2^HNI0j-0{#6H!lAw%o)eUX`a@qyvZ$N(WuJf| zuh1f2wEZmk>w$u5=y=pBc77j+%iEIhy-av-BV_sYzoWm{v&AMV zj(-tSZH0`B$i&EXeJZ0a`g*n$RS7*S;XrAuk`H+HQa|VVBkc9J6qfyGF`UlVMv8ii zk_vmBGIzUz8DGj*z6@5ubNe@S>{`vR$Q*xZ( zkn{y{f#069!6kPETbz~7d=*lez0U|WNZVO_F4}L)fB*bNcmIhk8g>|Ol|fTUpX)>Q z#LINcix)2@u`kq`BgCF{{3Px0>4em?FKoFZKLS@0u}j)5fmH5RL{waUBQX6n4|DS5 zJ#BFzVtvRY-b_(FP;{=4Nf0Y6?~H7tD%u@LN!vv%hJ3#*J50?rdY;1>;ui@?lGHKv za40!TLy6x58@(yk)TS%0VwY<@_-BFT|J9>k%mT;K4XN%b+7E%8tvUVV>L_4L(=G4t zhkSXlIknMWQ&`vcb|=)(x=VebJGzYeY-@o^e3A98g{z!SF|*PEvjP>#@1(cVou>=o zAF^7@5Qi#{qx&)`X3GZP`KH)z=j+>4{!tKIf?*zs(rxE4`jm3rkbhQN_?qk?0*)vx zGjM3C>7r+1oUk7oAZB+y@#Z-SL6>;)vAd8g9bB6< z-ulE&tm|SU@Mm5Bm(M)7USE`Q8I|KBh*R_%)Hk-x`Nk_iHSzmpHTg zOgNE~Ii${Yq%sw3lbQn(#|BXxDG#Bt+qt;n(Q9x3B^ zE#egCofh7a&PK=QSb)+_>yvtK45-3h&D8}(OI~mDRD*y7%^Pw~bRCsPQw)lI#rKQ1#I}!mdGMt|n)u)b3z^%7!Odfy&gHb!wHeG3#id zzabK85zO|El83$8mJwy@qE@vy=08ZTX2OaTC1XP5WI1VWhtFCcJN^aw+mXu5-k&0+ zfO+^2dwBj`IjE_brw?`Xqo?ASW8DnXtLZ_OMXg%l2y0pR*#M_XbX5wrM%`y)RE~vo zu7dfJzLc?!lR*HT?&eE>LsB%b|>N?2zqgth{EJJKf(z|6OQDYt>MB zxB{)WziPD9MKU{NqeI#}?^!>sX8oG<1lMzkJ_ih;K(wuPf9C3=AvcA4yNFYB9lrMU z2@@qI+*4WW=NmQd;p`HtPv~u0QT^7TtlwLu zyT##t?NfEjb(AH(oDF_A(N4kWxq8eAOYOXapnNAsm~lyQ&LETWQ#PzwvHJ11{wDf; zkxcBPT;05-i_AISp{#0Qln>72(=QK`gJoku)n`Hn6e~5I&Lpa3;I>N{OKi*jCmF7v zZamT7#177=zbLf>~-3pIX76 z_koh_=*-2evge~g7h*_-DCX|VO}S?+U6fr%AD}sT78cm-$)U4p&n6L|OdNig{v-Fc z#C6=adUy2TfeG3oZsmx>%dT(9dNJVr_;YJ@-QQY-f9Fxuf4 zvu+!?ugmsN6NC_zw*5~LhsGze7jbpTS^$84X4$JAM|{?l3HQaf$Luo)K9yUej+>j! 
z6^9fCwz7?Mt;z=~hs+vW2IpAAr`P_UBmo84U?v=Wa~p+wdH!{_8H&l-Vw#|emdl;^ zn4FJGT!NqRIHJ?^)$hw?#$zd^Wvtvj^>sXjhAb0tZp z2ShH=kVnzjYU)J8;~k{T-q5yVj6m$s*RMVsE}Z#U)c1X29wSob2x{9SlSz-`m0Irbdy zq-Q*K0aU`2U1NxE|D{GKohsThxwYf@0ndYt&7YZHiQs^YrQZP?M3LdL77fnmF6Rax z?Ufcr5SemUQ>~!NEPrb_m15RwRqK9uKS0p87&0YJi@BRm0s@ra;`~x0qimvmbjqrK zIR^W#RE~WbU(pd6W1J+^7OROq!GA$uD!%pee|_sfnOG@3)#uYl@l{)Qfr^_$bt*7Zi7__fAQ39bQnn!3tuPlh zJM1YNw2Rrs3BT)M?%WV&WVe~%20d*Qfw8=NOdLMXUH@WCbUeS&cW~*Iw)5^MN&cb< zj<35FNabD0q|6sT?rVt%)bqpl7W9w~yy*4sw$@x|Aa!wgXLQeq6U4S1%XUSv>|2HP z!fO+edb7FR1;Qe@p-UuL+xHD)TY%!{eysyF=aYFUv;__3wq9@HP1} zhe0eioH`))7Z0{4et7QtS8r?`?O^^_SslH1-edjW*@GhPL#bsYLY5XRUD};uKU>)F zfR8<1kym^HD}E>?DtQ^W-u%+7UlZ2e{WR9Y2YXunDz^+bY$*Gy2m3#Kz*dqy`SsmdxLWao*xL5nPNKyu0sj;u;;OmB$?ks@4TIvPgfTyVkb@T z8&y=GP~m&Sal;GIe~l=8qE*=2eHp&Z0xk=gffy=cZN9N_{=cq%Piq}(!fg3Z?95xA z9K`rNj;>;qln_TfNU4L0RVEO#4F)GvBIYlzQqWwq=Gpg7Km}f63;AEQBVs>F?cZPL zmN_oBPR2(VlBSF8lLmgUoW@Fg>N|S(V5PEf@`MQS`qqzE^E3Tzm!jH_UKU}Tuz5nm z`%=KibOWy=P&u>D^Zimn`G$kKJ?{@_E;%Mbfl=SVIWhCaJwQHwn2w7`mHTCXJ$ux5 zVBSq7K-;TqRB4{!}DWpF}yuHkoGcx^4KxYQAfy4l1S!`;cCK}Z zmb-sM7w}h4`{nujRerq|GSxI%{)qsYbg7NDfymLB6Hrl2;OE!UqEI)tka)8mtLMV8Qj}7&`J;nZ|-U#Q4MKp7y$^Y=BHId`Cc; zMsam^1r@;;Ugk1sb7oZ=TSOPql|OUhd40dgg@w?9#X+n4XM#UF#%P?8*_zqZ`?ORX z+mA`gwifs0bJrYcaD*!7n`R<8nSP@DaS6#tsGX%X+HobTJ zV1`-D;HXQ}Y)LO-Bc6*-K8sKkbCu_M=pfa9&aiSeqA_YH4GKE(m!B<{*EXd(msqm4 z-ioe59I|-gG95LW>!}_n6QfoA@m9|Q*+$tOntYm#e2^#X=MCn_rYuRM`)^s4A)F*mY1`V#Kk zIVcY7X=^hXA}zt#h6$&~A{R0KR%f+L1{MlsP+e|xl+s!-4wcslPe$~^mU6HbNj_QP zeCYeKd=h^Js4)P4PK`!Jc$5F<5H02Q z<_V{-3D)-!_Vi!Pq&U)|Gly{A?G(cWAZu9RyNj>0MObzUH7U|$Yig&*FYXbIN zd49I94ftddK8NKTW3>R>yr*9Z@84DbZ}Y+Lk^60hb4AP*yXh`?Y}Rm*N_B<$*hVq4PJV(j-w&2Arh80I0KJKU4Mw4MF*e@coF=Gv+aW`V{@YBurB z+G!f%#2T3XK>kOIPOod%ufJecD^={^{1{cXhs`I+azRP@Roc0<+FvVl@ovT6l*Z;M zI%v7E^2qZR`?(YPT$m>*2l)=BJAfGmx}@E zw_GxL*AT-_g@4qg)~6)rx2Y3Ze+R;1hh3s8qHVeI{zZQjR@}~y=R%wkk7Wl^>;O#J z=PXh#I)bSo){NhK0d84tC)Z(A z2eUV=CKN_1cLkzWe8YxoX?pH#p^YescbxtvTYxo9pzn66;bKWmuoJnJT337mt(DW@ zs(}{t?Ce@+QyU`!Ypl#>?zGU$TeR>)~lo2 zP8(fd&M|sQ^|l_6m|w2he5oDnKiaCIQs|U7e7)n%k4a4t6m#!Y5-E=j^AO1nc8MyN z=emdzeK}uViUqg-j#9}82RA&;x%LCS*Pv6})#wTcqW^;=8ncr){Cj(2_+~VI5jzp} zgUnT-!--l@C_0~-ku;EEunSnE-wQ%=a{C2@IU^O-YkE`vf$F67QgDDs@Uz6^*SZqq z(1*{8cn!09hFC{#MVBFpwu!qnd(21C zfG3I)x?g*V@Q|!wZNwaR%aI0qe9wbFSEs&1%pcvFf4^*bj(wJ z6y`+lKvU{4SE3bimG7q}yOumY_c-hZ@KCnjR-G>?1}P8gMST7Q)UG#L0qIPs*d#%ZKNONItApt0aGo zbpOWpNMs$Wj{wPq?->N4D3&#~Db-+ag=dg+z88#RM!~9!%wxT@>bMFfzI@U>@r1Ph zuavF$8#eH}eG4~D*qflMY=R#PSOStWis^K?ZV--oUV#pmr}Yj(>WX()%J(+CQlnnc z-nSV{N(%@00aeqrFCi68W0@DP1Du%UNHcuA>MuoKZw6t3prJ}@|yRr1|- zm5q1G@R>Y+Yj1vBHhh&+_>I52#r`4^hF=1G9~GFLxy!=%J@W1KTd*bS_;!3=u_loW zW{{E5FYV|jz%GK@6J)A@zgs5l&1orPWHJw*9iEAJKB=68)e1ydXKb9*Mx%hFhO82y zp84cloMR$69J`$)@jLSn)DlYdw8_J&6M?iq4hXmO)YHA1*2t!K-~<}Olum|vR#(mS4=osNPZ-Z#l( z)Fsn*&0@rEWqw`5Z(U&IUY$X`w@Ae(&`t~m%N_xn2)NNXFUB2bo%(BXkK77AC!q^{ z7TO73Dr*Rl+gx-=+~=uljwLK2$PHNwkp(Q?MObf)>t?K$=VPgT%?!<%5!?fuFu=L-e8B6l8&+d zGj)+6v!r=yq~1{HWvt!Llx-T(Kh114f&bk>zzsFAtshV?uG?U^b?g5w+J)-(B^WMkb6m(i$|6QK*aItw?=O0z_GXxnuS1LYKN%e&Pb7U)suQk2Ow5y$1<@# zd0Ohv$=8G1o5?`Q`4&sq-9QpEqjMZAppt4Pd|u#PNV)EMWF4kDj@;P(2ssl(K0Txh zo>PjzKR<+a27Y&e)PPMp{WZ;ab%_)gabND5l=HW5 zWP`h$%E*kxR+rbzPIO^-+}00ovp1a?ThJt8diA&P9ISEJ-JFi>@WboW-rT#;S@d`| z@>*3ho}+ZBBBFRwh&68tkTTwCku4+&-YDl8ZV4M4qsrBTRg)o`dF3^P&_{lt725TC zc(i9jg`3KlWa^RLcq;acoiB0*L>XCTa9;1)IHqbFwoxN?9uK>kBvLd zD`578F}cv&y~ z#;r*Q4gI{|1HwkF9WtgHd`k)Jg`)twg1wPrl>%>p%JaY@EMtL z^}Y8jB>e2^xRI{DjLg~>>d0bq&~+av^3(p*0W}};=4T5W82g%h2m_3EwxQ!mwk)Tecsce^=Coz`}FU@>i6{~Zhd)UQ?2L) z@ZG-I}78ZI@H({&Kjm!b@-8gNH 
zNUxqv$UemNPa89U6k}oaemkYk>iiZ*UOEtHVJ*9Fz%6oZa#Q}Jz_?-C`dvGt&C=b} z|L)`L38wQ~k(1Ssp}ic?7`zI89N)m@*iYtWR4gREW&N+{oFgJYnlkIEEj%J^MP8@bW331}!cvcSv072~y| zD!q7dU6tcrMe_Srn(m0eTBnap(q&ft@V7lO|JK?`(ZLuAX`74wI*bB+QwV`PJmPhQ zacaAS$G?3?=d(tP-!Z6p`4e_lg(2ZC^)i}6VG2H*1*X=s93Od_>5CNw*8kQR+(h}4 zU$koH1ToIO=C7mOX?k|74kMMN0P5~VbIu{QwDZejo|3=1D-e{xRNmJy%G|YUH{uy( zqQWm@qn8e=wr1}`hM9Uccjg|v(y}~@W?vR0lyNnfNKTS0HPBccyE-<I=QZW82WKdKLbH9#uu6VE?bx$;T)uC>#VlKCzDW0EmmaUz-0ELbR&J}z^ zxVR%dWf6=c%awkaiXCLdN+V`nGnmugoESE}Hgm{edWa)B_m%i)tI1S2`$i(ColDeE1~mE3!kMFT@}6>owwh9pYW))#jjqn!u_$3M=b6S3KsRRX2y@ zGq;9WRpOO^ z@itk*88}!#2n?gWI-MXzF@!Qd^|aN^gxcf0K~`Z^7Bw5z*PF?>C3>+b#k4aE^>9)p zES?YF;aVjwnDt-a=re0#hFG{H$$h3u0sLv?`6*>i4fWaA8Evl`R-2=yBhGD9e?o-z zyCI(RexK^#5jtjbaN)Cnj$|ZY(ytX?l)CBgNc}sb`-PThxuh#-0$P- z7@X)AQ9uAq!aPB$`vM1tobryLa^$&TKsK*7+2av)&S#74!$+MEgIHDbDhymC=qaUX zcso^jQ>T*(8R$S7nA1}3JXBw_(N*~f>DJxG!~ zU`h`T?&C!Eq|YH1N11`*q4k=oD`1MdH#2M@G${%I*r}z>-|l(j=)+Gg-*K0QS_hCs zyd-Dv(**67OI5@bPgL23b%VmsJx5}OMrm9gsG%JeU01>U%Hvrg?DVfa%&D0@mZ|@& z!dv*LH?QK_5u6_2`iB!$@HK_BXkJ zU4&vG$qs3y|MFo@|H2DJd}+nZLKWY6_~Hz>d#YD+!evWxXq6F$+lE=P(ytcpkcbtu zzGAv|cW~eE0=IT~jid-{WR*7^8bP&7F75 zTp8oR)D)FT)6Ua8@uo>_SGpY5x5|ja4Z(c?U?1SY=jsDSh~fn?TBKz{UYEY2Q@?>= zNp7Hew5={9sVA`n{s52%R%AHhtr{*^S19!RsAgK#)1b^+?5Q-IooAIl(tNKdHeb5h z->l=j&IIc?O;EOe(eA3D*^%;S%@##a9|b!*C$wEln9oH^ox!`LZzhoJ>x-s`<0FoG z3eS?A5_|5kxo$u_VE^ZV*^Wof?3`0SEKB^=Mcid{@v-io65wd_`T ztSwge)p#zt7d-G0Iq((;Q_QI2rVSK#s?Y>HPZi$=1xIh;RJp?ReQiL3#BYzbfu%P5 z;^%G30VQ^8O&h3UtH4f6-n0>;H1LvkmRz5!9zy_-R+^Sr-{I`-cJ+(hJ~h8Wvl91j zT2^px{FL&>9-kUEr|Tl37rqt1M&5YUC*V$8uV`IJu3uV|b52)rIyG>J|M@C@z+nFK zHeE4!MBBn1i0+roGwuJp;RYXg{QIU_$UBOt7Hq0PwOnxmd$8T6zP#P`swni;;^JcP z*%wZM{nnPD3)?eAJ?*=svl3Ff?QlrM6xcxWnp&uv;U>@bVFMT=xhL|Qn70J4#Y(s@ z@CJQmlrlWO<(qJ)eTL z^Uw^}1j_#E1%V7x*7X^xx4d^Z-W5wYOJ17Y8Xq=j(u5VQ?M-L1%rvBn62|ku7Nh-C zx(YX7ssI@=&aRtA#Atx6%EjJp&mM#)94_$vq*J>e`BAJ_PIuzpLR8r!3@!_=tE(GC z0W8wSH{hfNcL+Skr^hP>BwEulAo#-?z*Gk5KrS+9{A lI~Z+bGYSm;pa1rpUP5Btz~1qwRVeW1ZgR>1PJa{ zEJXu^5Fmtn={f5>=e)nZU+-G){Ua-~W%lfuYhN>a604=5LUHHe9RdOZidU*iIs^p7 zocQ;3B*ggFz^|=+1O)!)uap#ZeM}D^x7+ztJZo2;zgGP~^D>oGy#GUlaAvNs6ob{@ zq+AOj3Ph<%T4Pe>9HiU1;4xAf3L-W4uNRBPwwRl92nNw+TPG{#=m$F+$qM{?746#t z&)u!nn3<9FpVe=vs#5xY|M}3e(dGa$*v~awt3oyT2A0oSw|G^f1`W}%6iV-CbLOr* zYV_gv+5dSOcaA?AN=#?(SKv$>6Ah^>t&$(_M7G?R&A0@!G8yy#(~!MdnsMfN>qBv- z&b-FUm%tl!6(ns>vSQlNc4=kv{8Kd>{Vh_+bB*K(rocR;}F+t)WlUOT7|+)K1F8C zUTM#)o$rQMzf^XAGvgxdqQ3uNv!P{G8?_@v^Usqk7NPkr+-6`x4_1;1m^{3;uR-G% zY}H=8*Cwk*{7*X?1F{a6nKx;734a+m)<|f*_+USgME%dt9dud=2^YaF`SR_CAAbIy zzqne>P!XFI%<|u-szIF_8qF?#-pgXIt7&mFup>{bMSV8ZRD&|CoHQC1RQX|6i)Qz$ zPcT8wHB_^Hs-_d6Yn3RKnG37FO-D%e>^nWgwVdI8)~(0N$nL6)2XV3A-&DK}yIj(R zIGP5#q6joA{oq|DHHX!vvON`hr7PL@_6qzjv#QM2Tq;q0jap>Oy`wv*!>To6d$Imb6lrk;>c6wdc~RyU5J%yt=SjUSe3VC=L^in4TP(%!VhWCUAH&G>xsFo=Tb}d z&zh6{WdSX?uD#pgS+^{Cec#oPIbbzspVZys9S~8C{bfvs#gQ(_TR~d7puwZ z?G;p;CiyThaCUk=KoH_Q54ARA@9L>EBFW zTIXU;zMpC>ZH)t%n+_9=xN-V12B7 zM7=E7VG+cP4*u3G0F+*X%*wy}hM}|=fjkqZCsfvPLhfB!XEL?jhc=B>56)G_{O9>$ zNTGQ8%=M#1y>3A3Ls5C<*uImeemVTZ!MfN}z@Fe!xx`6#aSfp-MKWz`%*L{`=X6UG zB68sdjVd!_RT=OTq@jGZAD2SM_4p-d(Z^y&5ItHo*Mx(AZ=~LNEwvYhWuDSWcz*2( zbsx!*5Hq4|0@mP+C|sH{gN!M1XjMk)a+Km7crM#4O;{ekJ&mt|Bs`cho!LN9sa>#G z3m?mA^E=n4T|pfhCz$?g_m1~V$UcVfD(+y&Yx5ZjYKg;};$J*E9ObAob@q4E&DCZPwl#9i=#rlATDO?W2`Wb` zEz~`L0zt&u_NrJLLpoQ1Y2VaMTzrwQ;Y_vnSXO6mJ5Tuehs~Ef6tQosR>8o$3F235 zEeqYMp`N$dlD*Lm;g99|Sn!tL;!Y?@S=MxC z9Gmu7;wi+z+v66vulaOD5zTg2UiqPDVt&)&Q0nnE4+|z&5NbfzCU89xKH=;)r?@lt@$sw zU)RrxmIYaiSMG4UunfLk>Z$dRHb#nxMO4d@K0dASdbmWb`1{sGT*JL60GHyIA!@68 
zW0L9mSdg>oewdT0P!^gelPCAyd+i$#9x?xs>xfy16byJBoIX2Bq%4zK%wZ~5@8#B7 zd$lBF;L2Fa=B8jcM_kN3tl)0sa+!!Xq4+U75&p1e>*}v7MjWeduAWYHYAgBiDvNz+ zWIIXjpQ%ZmgEUf*?u^A)1wXENYH$a*(OlJ=e<{*=eBt046fb_aSd=qUD<{=zv%P4~ z;lgWq-Qru;k@A+Qpk#dM-a>}&&Lf7{mmpYks zm3lsYW1)I&GGn&Z64-aB7u+S2ES?&rG$+OOM_EFI|Dzhu@o#cOrEi2QJ+m@!d(F=G z=;=!IK7y8ZMDUhsi+81yx|s{muqhUVe9`Bki_Je%J6(R~qOXU=CJ8t+C9u1)F3FZZ zED_~wzo`{^1$G*eAy#W0*_s0>!$gHthHUUyZ)zIT!L!lsFZOsTtA$Owtnx+TLu1*$ zzg6FO-7f#L$IWR1shES#PyhJ955w;W{8_qWOHpfT;Z=KuIaRb>6s3I2 zyxHSda!czMPx~20gB0{aS(?47P_o|_tDA)HT-q*H_65&M*%~(2t6u)k>4!fe z&~<)_Xp(e|zYQzMtX+b`R#OxT19X%8D#=MDyY@0L4u&eOxZu$`qmlZM%<+d<^_iSv zE7}zul;2f`nxiT~vBs*={*}XVzO98Z^|)Dl;@nl6gO7AP=~&Ls!n!MjQD)%x>vZ4m8;ymsCp4Q#|opE>a`S|}3gDhGdlu~`FIoRltpJApn+xcir zXst})9&92cU$ni`f|r8yLSse{Gi!MX`hMEl!5=nN9n_In<~jB3DUkgeN!Z-XmFdiy z{9Kk)?_QleSB;}pbd_08HQUKpv$L==11-HOPOO&_B-if83D?6OdNCQ#{GW?IbH+ti zu?kraC`qxKzckkfn-?VhW<{(oIJ|_nq7ov9HyusWQGm+Wq62FrG+&?3k!(K0NgD;t z&z@C>>M+pub5sREEHz|3Ts={7m!2^+4>l!B_cB4L<>9nH&sT{%dl@pN^ z4p^(1wIft`HKQo!KQN;7j$kJtCWQC7ldC7U_!m^AA?s^suK@}T&80PIuiA3~>l)JW zwXij~%}{ArmC1abZ9o=GV6I*oEqs~9)4WEU?G5r{BVNqZ6gRjZw##xw61dUR5@dpc zCO+d-WxT(gsoP1mNggZs4v*foem37E<&05nzm~HEIg9_t78;58s{K+JRL1Y+8SVeF z{!DXiDke#aB8&4p^GVzW?3ihNaV%!aQcx%${kZhr0Z2qrq~aT&99)2tM;d)GOEn_r zI!!ptD#`9(9PAHbU^`9iF3KKDdQ@? zTNRz!V^8(0p6K;+5y!yJ*_xSk@l49?T&fbI&d&EZn4Z5Oi}HIXeU!@mLT$x^Gz(Ry zcNF=Wc9^0O9wCpL&OH}&jiDeT&&2J`5N$g7DnIK>36W21Zmu&=ada(mS}o|S+an}8 zZ@%9MG8wNN$Kt_Q>fx$ndncg?8+p{}qRI-?RzDo&Z#9DR|y5*q-D2UQF_p=vB2L_l%+Gey)R%xBT<2 zhq4*phlMypz)xXun_kRPL`-m(>i)p%?!CWo9bO)w8P+W7BIjpQ^t4Ow!ZF{e*A6hW4@jl%W@f(Kign8Eb6PisVokL79ra+ji*2*8oeU%b9_L<6PRcV%Jz#flCw<*V zJ`blReW#P2w%%K;-K@2%V$TE_JG1_SW#D&)HfHL7{gln6+7EIGRXF6KQR7};p2f)c znAhB`TK(|T7o!8XCM`-)cDPp>dQyRRyWv7<@C%P$FK5=JSVh`9@8+%<%pAA0#)7X? zs<2nW5=BhKRGnXA?vWP;d=XzU*sp|)8um9o^yVJ{HW<{PA3(jnD1VEI$Q9>vY~nFS zsk0o7N`}XXnp$W6U zn^jN3rW#Vc+UE*&v=!~Qo>^~M)7vjJbg5K^p%kv3axuX>agsHRw~HA~-|mN0N$eHM z+^p!iB#`TXTNv0t-6ebKr|OV_AfbDbb(yy|ZuDG4i8+`%A1X`}qDFp1&I=X`wXZ!! 
z)cVaeSR3}x(_0pN@dIV0sx`Qf)2l_fucm0bQgyH_;;g1Rznc&Ib%@P2Ow`=}FI@cJ zeD^`ja%#xwJVI97_xHD6?={Q{Vh#nDDoxdS&HJyH7C-xiMr#~oVi1k3#w^=z#_hup zH7!Ap+1VST>Q}$9H0*{lI{6d?2mX3v=85+5XA0P}&sg*Oy%LOa@#pzxpvsvyZMdyD ze-_`x?<&ETaiB5(tzuT>p=UZjMq5u$6Gq5NLz6u7N&N^Ha(ngCNHchc3f8@s06eA3 z{f{iAsfG~LFU`P{X=`XYHW1%yIP_n=B2f~H7d>sgTY7a}*jlq|=y*of|8x>LJE96W zTlSnYLY!As&Hf~Ogr7=pjF{1>e#u4du&uX_;q!kNyt5xl<#%lMP7n2!=@u;wjlqnJ z0|YgTE7!KqZoOJG#hotHcxS);mwxtEYx#Ry=j7qN8EHDND`(Vuabxn`H~l#>XFr$# z|LW{?nM>FGd*}MQ-bYqbiJL8>e=BxnANBpvoB0_peqww-gwI+uoaJI&eW$AGkK{H* zs9BS9{h5!AN1gkt^(g1tx#{X86r}&wXVmw#!C+rl9=_qUL(NiMmyh$5ij2n*ZrL{; z-&^dio~)JS_xG(woxMzHwXc9L{N8SilY8EdL4JCaRXB!eP`?>{p6 zO(to*=4~m9$<(C$cbkuhG$#SI@INE`>BC&ghS#!#I*o7vliHuJ%I>c{T`ft6}40SwCg}XG!@;Lfv!AA5y}?7v=w7$Y@zeV=vTNP;N9Ai>AiGP+ z6+i5Kus^GhD$>QYt?CTY$d-`UK)v#^)YNE}>J|Xber7cwr^!e5N=1zi?^W|5# zJZ_Ley{rAQ6B<2STf1QhlR-9r-rhWGlJ5;Xy6wFt+8i*u5p;ba-A2xjD3W&Rsp?=| z>pook?D?jq_u{BVeI-A5)d09ZxLZUvX5R{d8%a>r80mHX+Iy1&F491DmOeljB*-3h z6shYRv#nU_XE0H-HRK>n@5V8I^laoRI*N{R zMmwkV=66o^soB*rBRmNA81`@xwpip1@k4gVW4_Ii^I^7UZjWK>_92Ys zsMJ{A){NStfWTV>Gcj9TX2RY_K?TfN4gNiT>ehJ~ahSc|bv7cblvPq)1{mEjKg!_ z+jVvG!4=n>|F*FD*W|H`HycJ^48NJ2_t>aDRn0zoD-3G$+5X%gMRk;mxw57kGx3=F z1l)g@)=oLL+ig_KY}oe9<8+&DG%s*U4+w4I0WP>R9Bv#i4!Lx-?&5dWt`=J*-)m*J zqO-rq8dPjANVn>X9q)8|j%(e|2G!wCB%)_)o|s&J%E<Qlii>6RPCVH~VuJG;zDps8q2M-V=CJa20n{IaP|yYtk)jSAGpg>(pkd7eAF z!2NVXdTA!iK6hZCoyqK&|Dk*vboDl0z+i=?QCM5YfkP$h@ zww#8N=B&#p75dq8+9JEt;|84p?sxv4WvsQUMbz@PnbBC3xE^e3%DCCBNanc@{~T?7 zV_~dO{YFr(PO3bID*!(FV$Q_V4Se8scvv1BWc2oEemgMbco-^RmySJNN?{+{T*K^r_IwI-@&69a*(_*Y;*Oe)BvH#iA=hs^EjuBP)menGVO=C1Fd!~ra!JIQlQ>8JNBhPV8Z!@?NJw8}XcC}=TbMvp|TZp!a zWlS*cCD;rzy%aqhF>BK5RB@QGc%V&%_a+ea4wYxn2qh~6-sQY7P2MeXWQj{>Y-ETe z@9sU{PnTJ3dGG4xuPaBTp9r#+-(KlOFW;cHr~XK zt$CGP;Y+aR`dsg-lmDTt_n(mrgZev0rPBIu(n?e6eq;OCc=G1bHu8WSKuuX|iv?1; zf;GNPY$h~ua(itT`~GX@kNr+N+@@yc(&0Eq6n>8pTm0V3fzDl!a~G4gnYnYPF%bG& z0+XTU@!y{Q%${mU-vX<|9L#}u6SjxVwP%Vjnj9}Ex_x`JV{3cL4jxYBcrNxDP=)kz z{9CB6Jwa?nW@n}ElGXc0iVP|QTU@lKJ0WlAoy?Ry9=MxhZMJ9+>&T;e?sNq0I+zJ? 
z9{H9Y=s|;7sJ=0JELc!lTd~IN&*V0(ODqOsobw5|J+(!)gy=jzC~k7@Zd^E@bXk?r z^-8S(-z4-ckL;)G59g}=_2L}!n&~9xhCCChxXB)~&F|8}J=W||IQ!uC7Yy#4$5TnV%Bm z;3FFSp#o6=*`N3wjvIEIO3KW8SIX9pA^7HDtW^`oCj4q?&AIN@PJ+yS{O4_x0g_1m z)-}(f&9ueppafxBdj59;Lyvy>wJw2sXR!rdqt>_9Zs9Y62cBobKt!gvn?TdE&TmQU z9i0bDZLLpw$mVowcB23=R&)-5A0K9sFnR>kH53H2?&4vj4Zxk?%`w5KeeO%KY~yta zsq$tjY!tuLgq+}D4J%c18?8E%RR}(&UJW-HU67@euVQ$doFP~$ImdLo`$Q+i*z3d> z+;!-0-c+EfEi#L!XH+)X`@r8-;sGrvqr#*KDmx4H+1=47N z5`xjLS^@YD)ajsC2$8D5N6HlZUU?=Iq_uIj4)80))8Qdu6$aYWU)R~<5@n`Ee4eVX z%Wk;=&$V}~cQ@VNMFEC3=QsEM-gGPI)yB!0e&K&K_9>A6JmHQT-}%h638WXFTG~yb zAANQ-FGJPrsLZzQId!Kv40D}=Se^Zuv>t@__;5s2^MxZw$+Od%ZlWSo~Dh zJckq1{qk{vn-;)*y1SubtsT1;pd51}Q~6If=T*0|r@3(MNSaZ&cdrd{=b`2j@-5Js zZi+JsFb5S|4qTj5KBvWfFU5!tj!=!|$xy!v$-06Z&DZ4Id+#f|F%Qs}8kEpjY3M+A zsKt1C=XQTv&MQ9{Kaj42J+xM>7bMDIYOA5-65}K(0_QD%YXw^|W26I?s3>Zn*@H=i zD`0ut(O!SEd^7i6bzH|R%GbR4_M9&?c4-v5&ks4ze(9REY3e_sU-mjn?<2Uw?NVHw zIl;dSh|h_5%xIfA%m%Nk3qCZY1La#K+$j=#2R8t0F05IGo<+-E+>Ca_FHs?K{`NlFEf>AO)mmxO_ctT$-1Yic8Qq3>7ouzYsV>mczNLyHwsy}_mD;YfoEMeb zX7<)|SHi`9(kH+kE=!kguR5YpJ(={o>W>SzlJ0d`kScfu#F?d&~66Lh16yoe8emke+=F&0f zHn@~sV(^eQzIi=snD=7Cg}w)KD))NAO9QeuIT*t8sc~i{DN_mWi#OJ~q{WtX*Q>wJ z{I5rkxAV{s4SP~IjKPoWa!CNRTihn(OLw(;)~}l$ge8Nb0R0N?2~9H1Q~4M{08mg8 z@g>wV=F!7_)sC~7m2-yo=*UB_ENSf7Te@@2kfiPMJg?U&cOr1X4LuTF2=DPeFTmm{ zDqe@4);8D8Aw{D~CMe%1WSzNRXZB@B%^v!^1D-3@zSg;HUtTcz5Rz9*4zK0p$k2&bKy^-_uKRO|5i*_`I{O8Yn&)@w%VOI(XSWdhe% z>P1k&s;u&X^hoM`VNRijo@erYo()!0;#AUAYvL2FDB1R#syD>LC$+!iG7bqXHU=OR z!rpIumzmM?-bW=VnhrN+4{7ObN7A%da#~Yh$%j(sLD6S7J4K_%%Nf?nmHgIsm8~wx zj|D#TP~ON7Cl#`^<4$VJAgB5EjE~P&y8K{ZXL+OPU_lXiz?|vqb`J_2B&Gk?T)?`8 z3b_-sp+a!aEhi7-sWuGB6p7A+Ndheq{5ORLnuRsSLF*Yec1LVf-DZG-opgSq@+{lW z7yM+*h()xVJ59bNbzuX!UCxmvd@zwfAYj)SXI?eVtorLoZ4ieq!#ZTdK;e_b`fq>0}U z-IP0NpmHHPY`|J9osY=o-_}ulDU}}`-gA8uYDFW;b!^fatYf*YEi-ga=0%CeTplIcrd!_4LCs&qp_W?g6$~Y5;@8S_8HU&LdPPrsuBqYnG<1 zF8NZts%yr;!(3s9y~ucO@&iONRiOD@RYBOc1S{^xj>(M8ptkL_3R9dH^s*koGi9S{ z&L?d4%GYU8SNs8kn?j)bHO~-4+Amt(lm|JRfmU{mG(BF@q<;?6#{1iVZ}-K5rs=#b z@}{Q1?T`KRJ^E_5mG}KrMgF>9{i??etTw`G8)@(i7)J0c^n>4EV`3g9KCg9(I3v=; zqYH$9sydhxq>t;@B#7-JSW*%2@>`;Hwd$&yHoLeMN@T4COea*}GFN(TZsnh`0sIU! 
[... base85-encoded GIT binary patch payload for the preceding docs/ image omitted ...]

literal 0
HcmV?d00001

diff --git a/docs/sft_feature3.png b/docs/sft_feature3.png
new file mode 100644
index 0000000000000000000000000000000000000000..029a4639c463944200dec856108af8520a291362
GIT binary patch
literal 27276
[... 27276-byte base85-encoded payload of docs/sft_feature3.png omitted ...]
z@i*vA^TN*b#Bi2xslbQN`>209!0c0w@2``#-*SDmj*Vb@#X(H}#az>h=Q7eW?emC& zW7LoQOcmle+c#Fvb3DkVjK8FLZhE$LzvXmSpsX-aQIRTH_+>()VnY+AE!@CnQSBqN zHblWSMo4cQQKZ2mE`Jk3?L|t`whP&^Nz1Hx=B2W%3JV&0A2?BCr&wlR?;Ovb9WB1@ zD&bkCrhxehH>N-gfT;xxc6L2|OKaL$2!4Nf&%&Pn(8_q@$bS4OkfTK12gNZ^W)000 zCsOA>ArL@Aq456~Sb-g+%$ni%n*VMlLH+{ Date: Fri, 2 Aug 2024 08:56:18 +0000 Subject: [PATCH 9/9] Dont predict EOText token --- src/nanotron/data/chat_tokenizer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/nanotron/data/chat_tokenizer.py b/src/nanotron/data/chat_tokenizer.py index f8ff3b09..c3252925 100644 --- a/src/nanotron/data/chat_tokenizer.py +++ b/src/nanotron/data/chat_tokenizer.py @@ -54,7 +54,9 @@ def __call__(self, conversation: List[dict]) -> Tuple[List[int], List[bool]]: # Append <|end_of_text|> token tokens.extend(self.tokenizer.encode("<|end_of_text|>", add_special_tokens=False)) - is_completitions.append(True) + is_completitions.append( + False + ) # NOTE(tj.solergibert) No need to predict <|end_of_text|> token from <|eot_id|> token return tokens, is_completitions