Misc changes and fixes for llama cpp (#236)
IlyasMoutawwakil authored Jul 31, 2024
1 parent 0aac010 commit 9197108
Showing 18 changed files with 187 additions and 239 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_api_misc.yaml
@@ -49,4 +49,4 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
PUSH_REPO_ID: optimum-benchmark/misc
run: |
pytest -s -k "api and not (cpu or cuda)"
pytest -s -k "api and not (cpu or cuda or rocm or mps)"
.github/workflows/test_cli_cpu_llama_cpp.yaml
@@ -1,4 +1,4 @@
name: CLI Llama.cpp Tests
name: CLI CPU LlamaCpp Tests

on:
workflow_dispatch:
@@ -26,23 +26,23 @@ concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

jobs:
run_cli_llama_cpp_tests:
run_cli_cpu_llama_cpp_tests:
runs-on: ubuntu-latest

steps:
- name: Checkout
uses: actions/checkout@v3
uses: actions/checkout@v4

- name: Set up Python 3.10
uses: actions/setup-python@v3
uses: actions/setup-python@v5
with:
python-version: "3.10"

- name: Install requirements
run: |
pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -e .[testing,lamma-cpp]
pip install -e .[testing,llama-cpp]
- name: Run tests
run: pytest -s -k "llama_cpp"
4 changes: 2 additions & 2 deletions .github/workflows/test_cli_misc.yaml
@@ -52,7 +52,7 @@ jobs:
run: |
pip install --upgrade pip
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
pip install -e .[testing]
pip install -e .[testing,timm,diffusers,codecarbon]
- name: Run tests
run: pytest -s -k "cli and not (cpu or cuda)"
run: pytest -s -k "cli and not (cpu or cuda or rocm or mps)"
7 changes: 5 additions & 2 deletions README.md
@@ -13,6 +13,7 @@ Optimum-Benchmark is a unified [multi-backend & multi-device](#backends--devices

*News* 📰

- LlamaCpp backend for benchmarking [`llama-cpp-python`](https://github.com/abetlen/llama-cpp-python) bindings with all its supported devices 🚀
- 🥳 PyPI package is now available for installation: `pip install optimum-benchmark` 🎉 [check it out](https://pypi.org/project/optimum-benchmark/) !
- Model loading latency/memory/energy tracking for all backends in the inference scenario 🚀
- numactl support for Process and Torchrun launchers to control the NUMA nodes on which the benchmark runs.
@@ -47,18 +48,20 @@ Optimum-Benchmark is continuously and intensively tested on a variety of devices

### CLI 📈

[![CLI_CPU_LLAMA_CPP](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_llama_cpp.yaml)
[![CLI_CPU_NEURAL_COMPRESSOR](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_neural_compressor.yaml)
[![CLI_CPU_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_onnxruntime.yaml)
[![CLI_CPU_OPENVINO](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_openvino.yaml)
[![CLI_CPU_PYTORCH](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_pytorch.yaml)
[![CLI_CPU_PY_TXI](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_py_txi.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cpu_py_txi.yaml)
[![CLI_CUDA_ONNXRUNTIME](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_onnxruntime.yaml)
[![CLI_CUDA_VLLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm.yaml)
[![CLI_CUDA_PYTORCH_MULTI_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_multi_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_multi_gpu.yaml)
[![CLI_CUDA_PYTORCH_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_pytorch_single_gpu.yaml)
[![CLI_CUDA_TENSORRT_LLM](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm.yaml)
[![CLI_CUDA_PY_TXI](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_py_txi.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_py_txi.yaml)
[![CLI_CUDA_TENSORRT_LLM_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_tensorrt_llm_single_gpu.yaml)
[![CLI_CUDA_TORCH_ORT_MULTI_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_multi_gpu.yaml)
[![CLI_CUDA_TORCH_ORT_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_torch_ort_single_gpu.yaml)
[![CLI_CUDA_VLLM_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_cuda_vllm_single_gpu.yaml)
[![CLI_MISC](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_misc.yaml)
[![CLI_ROCM_PYTORCH_MULTI_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_multi_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_multi_gpu.yaml)
[![CLI_ROCM_PYTORCH_SINGLE_GPU](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_single_gpu.yaml/badge.svg)](https://github.com/huggingface/optimum-benchmark/actions/workflows/test_cli_rocm_pytorch_single_gpu.yaml)
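To connect the README note about the new LlamaCpp backend to code, here is a minimal sketch of launching a benchmark through the Python API. It assumes `LlamaCppConfig` is re-exported at the package root like the other backend configs; the GGUF repo id and filename are placeholders, not values from this commit.

```python
# Hedged sketch: benchmarking the llama_cpp backend via optimum-benchmark's Python API.
from optimum_benchmark import Benchmark, BenchmarkConfig, InferenceConfig, LlamaCppConfig, ProcessConfig

config = BenchmarkConfig(
    name="llama_cpp_text_generation",
    launcher=ProcessConfig(),  # isolated-process launcher
    scenario=InferenceConfig(
        latency=True,
        memory=True,
        input_shapes={"batch_size": 1, "sequence_length": 64},  # batch_size must be 1 for text-generation
    ),
    backend=LlamaCppConfig(
        device="cpu",
        task="text-generation",
        model="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",   # hypothetical GGUF repo
        filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",  # hypothetical file in that repo
    ),
)

report = Benchmark.launch(config)
print(report)
```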
15 changes: 12 additions & 3 deletions optimum_benchmark/backends/base.py
@@ -68,20 +68,29 @@ def __init__(self, config: BackendConfigT):
self.automodel_loader = get_timm_automodel_loader()
self.pretrained_processor = None
self.generation_config = None

elif self.config.library == "llama_cpp":
self.logger.info("\t+ Benchmarking a Llama.cpp model")
self.model_shapes = {}
self.logger.info("\t+ Benchmarking a LlamaCpp model")
self.pretrained_processor = None
self.generation_config = None
self.pretrained_config = None
self.automodel_loader = None
# TODO: need a custom method to extract shapes from gguf
self.model_shapes = extract_transformers_shapes_from_artifacts(
self.pretrained_config, self.pretrained_processor
)

else:
self.logger.info("\t+ Benchmarking a Transformers model")
self.generation_config = get_transformers_generation_config(self.config.model, **self.config.model_kwargs)
self.pretrained_config = get_transformers_pretrained_config(self.config.model, **self.config.model_kwargs)
self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task)
self.pretrained_processor = get_transformers_pretrained_processor(
self.config.processor, **self.config.processor_kwargs
)
self.model_shapes = extract_transformers_shapes_from_artifacts(
self.pretrained_config, self.pretrained_processor
)
self.automodel_loader = get_transformers_automodel_loader_for_task(self.config.task)

def seed(self) -> None:
set_seed(self.config.seed)
20 changes: 15 additions & 5 deletions optimum_benchmark/backends/config.py
@@ -52,17 +52,27 @@ def __post_init__(self):
self.processor = self.model

# TODO: add cache_dir, token, etc. to these methods
if self.library is None:
self.library = infer_library_from_model_name_or_path(
self.model,
revision=self.model_kwargs.get("revision", None),
token=self.model_kwargs.get("token", None),
)

if self.task is None:
self.task = infer_task_from_model_name_or_path(
self.model, self.model_kwargs.get("revision", None), self.library
self.model,
self.library,
revision=self.model_kwargs.get("revision", None),
token=self.model_kwargs.get("token", None),
)

if self.library is None:
self.library = infer_library_from_model_name_or_path(self.model, self.model_kwargs.get("revision", None))

if self.model_type is None:
self.model_type = infer_model_type_from_model_name_or_path(
self.model, self.model_kwargs.get("revision", None), self.library
self.model,
self.library,
revision=self.model_kwargs.get("revision", None),
token=self.model_kwargs.get("token", None),
)

if self.device is None:
6 changes: 5 additions & 1 deletion optimum_benchmark/backends/diffusers_utils.py
@@ -1,3 +1,4 @@
import warnings
from typing import Dict

from hydra.utils import get_class
@@ -38,7 +39,9 @@


def get_diffusers_pretrained_config(model: str, **kwargs) -> Dict[str, int]:
return DiffusionPipeline.load_config(model, **kwargs)
config = DiffusionPipeline.load_config(model, **kwargs)
pipeline_config = config[0] if isinstance(config, tuple) else config
return pipeline_config


def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]:
@@ -62,6 +65,7 @@ def extract_diffusers_shapes_from_model(model: str, **kwargs) -> Dict[str, int]:
shapes["width"] = vae_config["sample_size"]

else:
warnings.warn("Could not extract shapes [num_channels, height, width] from diffusion pipeline.")
shapes["num_channels"] = -1
shapes["height"] = -1
shapes["width"] = -1
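A side note on the tuple guard introduced above: `DiffusionPipeline.load_config` may return either a plain config dict or a `(config_dict, unused_kwargs)` tuple depending on the kwargs forwarded to it, which is what the normalization handles. A rough illustration (the model id is only an example):

```python
# Illustrative sketch of the dict-vs-tuple return shape normalized above.
# The model id is an example; any diffusers repo with a model_index.json works.
from diffusers import DiffusionPipeline

config = DiffusionPipeline.load_config("stabilityai/stable-diffusion-2-1")
pipeline_config = config[0] if isinstance(config, tuple) else config
print(pipeline_config["_class_name"])  # e.g. "StableDiffusionPipeline"
```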
79 changes: 28 additions & 51 deletions optimum_benchmark/backends/llama_cpp/backend.py
@@ -1,5 +1,5 @@
from tempfile import TemporaryDirectory
from typing import Any, Dict, Tuple
from typing import Any, Dict

from llama_cpp import Llama

@@ -10,13 +10,11 @@
class LlamaCppBackend(Backend[LlamaCppConfig]):
NAME: str = "llama_cpp"

pretrained_model: Llama

def __init__(self, config: LlamaCppConfig) -> None:
super().__init__(config)

if self.config.no_weights:
self.logger.info("\t+ Loading no weights model")
raise NotImplementedError("No weights model is not yet implemented")

def load(self) -> None:
self.logger.info("\t+ Creating backend temporary directory")
self.tmpdir = TemporaryDirectory()
@@ -28,65 +26,44 @@ def load_model_from_pretrained(self) -> None:
"""
Load the pretrained model from the given model name (normally GGUF, GGML)
"""
embedding = True if self.config.task == "feature-extraction" else False

self.pretrained_model = Llama.from_pretrained(
repo_id=self.config.model, # type: ignore
repo_id=self.config.model,
filename=self.config.filename,
verbose=False,
echo=False,
embedding=embedding,
) # type: ignore
**self.llama_cpp_kwargs,
)

@property
def llama_cpp_kwargs(self) -> Dict[str, Any]:
return {
"embedding": self.config.task == "feature-extraction",
"verbose": False,
"echo": False,
}

def prepare_input_shapes(self, input_shapes: Dict[str, Any]) -> Dict[str, Any]:
if self.config.task == "text-generation":
if input_shapes["batch_size"] != 1:
raise ValueError("Batch size must be 1 for LlamaCpp text generation")

def validate_task(self) -> None:
if self.config.task not in ["text-generation"]:
raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")
return input_shapes

def prepare_inputs(self, inputs: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
def prepare_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
if self.config.task == "text-generation":
if inputs["input_ids"].shape[0] != 1:
raise ValueError("Batch size must be 1 for Llama.cpp text generation")

inputs = super().prepare_inputs(inputs)
inputs["tokens"] = inputs["input_ids"].squeeze()
return {"tokens": inputs["input_ids"].squeeze(0).tolist()}

return inputs
elif self.config.task == "feature-extraction":
detokenized_batch = list(map(self.pretrained_model.detokenize, inputs["input_ids"]))
decoded_batch = [x.decode("utf-8") for x in detokenized_batch]

inputs["input_str"] = decoded_batch
return inputs
return {"input": [self.pretrained_model.detokenize(x).decode("utf-8") for x in inputs["input_ids"]]}

raise ValueError(f"Task {self.config.task} not supported by {self.NAME}")

def forward(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> Any:
"""
Forward pass of the model\
Get the embeddings of the input tokens
"""

return self.pretrained_model.embed(inputs["input_str"])
self.pretrained_model.embed(**inputs)

def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
"""
Prefill the model with the input tokens
We consider prefill as the time to first token, thus we evaluate the time it takes for the model to generate the first token
"""

next(self.pretrained_model.generate(tokens=inputs["tokens"]))
return inputs
next(self.pretrained_model.generate(**inputs))

def generate(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
"""
Generate new tokens from the pretrained model
"""

output = []

for token in self.pretrained_model.generate(tokens=inputs["tokens"]):
output.append(token)
if len(output) >= kwargs["max_new_tokens"]:
break

return output
generator = self.pretrained_model.generate(**inputs)
for _ in range(kwargs["max_new_tokens"]):
next(generator)
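For readers unfamiliar with the `llama-cpp-python` calls the reworked backend drives, here is a rough standalone sketch of the same flow; the repo id and filename are hypothetical. It shows why `prepare_inputs` now hands `generate` a flat list of token ids and `embed` detokenized strings.

```python
# Hedged sketch of the llama-cpp-python calls used by the backend above.
# Repo id / filename are placeholders for any GGUF checkpoint on the Hub.
from llama_cpp import Llama

llm = Llama.from_pretrained(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",  # hypothetical
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",   # hypothetical
    embedding=False,  # set True for the feature-extraction task
    verbose=False,
)

# text-generation: generate() takes a flat list of token ids and yields one token at a time,
# so the first next() is the prefill / time-to-first-token and the loop is the decode phase.
tokens = llm.tokenize(b"Hello, benchmark!")
generator = llm.generate(tokens=tokens)
first_token = next(generator)                   # prefill
new_tokens = [next(generator) for _ in range(15)]  # decode loop

# feature-extraction: embed() takes decoded strings rather than token ids
# (requires a model loaded with embedding=True).
# embeddings = llm.embed(["Hello, benchmark!"])
```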
21 changes: 7 additions & 14 deletions optimum_benchmark/backends/llama_cpp/config.py
@@ -1,16 +1,9 @@
from dataclasses import dataclass
from logging import getLogger
from typing import Optional

from ...import_utils import llama_cpp_version
from ..config import BackendConfig

LOGGER = getLogger("backend")


def llama_cpp_model_kwargs():
return {"verbose": True}


@dataclass
class LlamaCppConfig(BackendConfig):
@@ -19,16 +12,16 @@ class LlamaCppConfig(BackendConfig):
_target_: str = "optimum_benchmark.backends.llama_cpp.backend.LlamaCppBackend"

no_weights: bool = False
library: str = "llama_cpp"
filename: Optional[str] = None

def __post_init__(self):
super().__post_init__()

self.device = self.device.lower() # type: ignore
self.library = "llama_cpp"
self.model_type = "llama_cpp"

super().__post_init__()

if self.device not in ["cuda", "mps", "cpu"]:
raise ValueError(f"Llama.cpp Backend only supports 'cpu', 'mps' and 'cuda' devices, got {self.device}")
if self.task not in ["feature-extraction", "text-generation"]:
raise NotImplementedError(f"Task {self.task} is not supported by LlamaCpp backend.")

LOGGER.warning("Llama.cpp automatically selects the device, ignoring the device parameter in the config.")
if self.no_weights:
raise NotImplementedError("`no_weights` benchmarking is not supported by LlamaCpp backend.")
1 change: 1 addition & 0 deletions optimum_benchmark/backends/pytorch/backend.py
@@ -441,6 +441,7 @@ def train(
training_data_collator: Callable[[List[Dict[str, Any]]], Dict[str, Any]],
) -> TrainerState:
self.logger.info(f"\t+ Wrapping training arguments with {TrainingArguments.__name__}")
training_arguments["use_cpu"] = self.config.device == "cpu"
training_arguments = TrainingArguments(**training_arguments)
self.logger.info(f"\t+ Wrapping model with {Trainer.__name__}")
trainer = Trainer(
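For reference, `use_cpu` is a standard `transformers.TrainingArguments` flag, so the added line simply pins the `Trainer` to CPU whenever the backend device is `cpu`. A minimal sketch of that wrapping step (output dir and batch size are arbitrary):

```python
# Minimal sketch of wrapping a kwargs dict into TrainingArguments with use_cpu forced.
from transformers import TrainingArguments

training_arguments = {"output_dir": "./trainer_output", "per_device_train_batch_size": 2}
training_arguments["use_cpu"] = True  # mirrors: self.config.device == "cpu"
args = TrainingArguments(**training_arguments)
print(args.use_cpu)
```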
13 changes: 10 additions & 3 deletions optimum_benchmark/backends/transformers_utils.py
@@ -97,12 +97,19 @@ def get_transformers_pretrained_processor(model: str, **kwargs) -> Optional["Pre


def extract_transformers_shapes_from_artifacts(
config: "PretrainedConfig", processor: Optional["PretrainedProcessor"] = None
config: Optional["PretrainedConfig"] = None, processor: Optional["PretrainedProcessor"] = None
) -> Dict[str, Any]:
artifacts_dict = {}

config_dict = {k: v for k, v in config.to_dict().items() if v is not None}
artifacts_dict.update(config_dict)
if config is not None and hasattr(config, "to_dict"):
config_dict = {k: v for k, v in config.to_dict().items() if v is not None}
artifacts_dict.update(config_dict)
elif config is not None:
try:
config_dict = {k: getattr(config, k) for k in dir(config) if isinstance(getattr(config, k), int)}
artifacts_dict.update(config_dict)
except Exception:
warnings.warn(f"Could not extract shapes from config {config}")

if processor is not None and hasattr(processor, "to_dict"):
processor_dict = {k: v for k, v in processor.to_dict().items() if v is not None}
(Diffs for the remaining changed files are not shown.)
