NVIDIA GPU Support #11

Closed
26 changes: 0 additions & 26 deletions api/13B.Dockerfile

This file was deleted.

26 changes: 0 additions & 26 deletions api/70B.Dockerfile

This file was deleted.

25 changes: 10 additions & 15 deletions api/Dockerfile
@@ -1,21 +1,16 @@
 # Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-7b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin
+ARG IMAGE=nvidia/cuda:12.2.0-devel-ubuntu22.04

 FROM ${IMAGE}

-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=America/Seattle
+RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN pip install numpy diskcache uvicorn fastapi sse-starlette pydantic-settings
+ENV HOST=0.0.0.0
+ENV PORT=8000

 # Get llama-cpp-python
 WORKDIR /app

 COPY . .
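With the download step gone, the image no longer bakes a model in; it only installs a cuBLAS-enabled llama-cpp-python on the CUDA devel base. For reference, it can be exercised outside of Compose. A minimal sketch, assuming the tag `llama-gpt-api-cuda` is arbitrary and that the container's default command starts the API the same way `run.sh` does:

```bash
# Build the CUDA-enabled API image from the api/ directory
docker build -t llama-gpt-api-cuda ./api

# Run it with GPU access; MODEL and GPU_LAYERS mirror the Compose environment
docker run --rm --gpus all \
  -v "$(pwd)/models:/models" \
  -e MODEL=/models/llama-2-7b-chat.bin \
  -e GPU_LAYERS=100 \
  -p 8000:8000 \
  llama-gpt-api-cuda
```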
8 changes: 3 additions & 5 deletions api/run.sh
@@ -1,18 +1,16 @@
 #!/bin/bash

-make build
-
 # Get the number of available threads on the system
 n_threads=$(grep -c ^processor /proc/cpuinfo)

 # Define context window
 n_ctx=4096

-# Offload everything to CPU
-n_gpu_layers=0
+# Offload some to GPU - don't really know how to auto-calculate this
+n_gpu_layers=${GPU_LAYERS}

 # Define batch size
-n_batch=2096
+n_batch=2048
 # If total RAM is less than 8GB, set batch size to 1024
 total_ram=$(cat /proc/meminfo | grep MemTotal | awk '{print $2}')
 if [ $total_ram -lt 8000000 ]; then
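The `n_gpu_layers=${GPU_LAYERS}` change leaves the layer count to the caller, and the inline comment admits there is no automatic calculation. One possible heuristic, sketched below and not part of this PR, is to derive a default from total VRAM; the per-layer size and the 32-layer cap are rough assumptions for a q4-quantized 7B model:

```bash
#!/bin/bash
# Hypothetical helper: pick a default for GPU_LAYERS from total VRAM.
# Not part of this PR; the MiB-per-layer figure is a coarse estimate.

vram_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n1)
# Assume roughly 160 MiB per offloaded layer for a q4-quantized 7B model
mib_per_layer=160
# Leave about 1 GiB of headroom for the KV cache and CUDA context
usable_mib=$(( vram_mib - 1024 ))

if [ -z "${GPU_LAYERS}" ]; then
  GPU_LAYERS=$(( usable_mib / mib_per_layer ))
  # A 7B model has 32 transformer layers; never request more than exist
  [ "$GPU_LAYERS" -gt 32 ] && GPU_LAYERS=32
fi
echo "Defaulting GPU_LAYERS to ${GPU_LAYERS}"
```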
18 changes: 15 additions & 3 deletions docker-compose-13b.yml
@@ -2,13 +2,25 @@ version: '3.6'

 services:
   llama-gpt-api:
     # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-13b-chat:latest'
     build:
       context: ./api
-      dockerfile: 13B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
-      MODEL: '/models/llama-2-13b-chat.bin'
+      MODEL: '/models/nous-hermes-llama2-13b.ggmlv3.q2_K.bin'
+      GPU_LAYERS: 45
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
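The added `deploy.resources.reservations.devices` block only takes effect when the host has the NVIDIA Container Toolkit configured as a Docker runtime. A quick way to confirm GPU passthrough before bringing the stack up; the CUDA tag simply mirrors the one used in `api/Dockerfile`:

```bash
# Should print the same table as running nvidia-smi directly on the host
docker run --rm --gpus all nvidia/cuda:12.2.0-devel-ubuntu22.04 nvidia-smi
```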
17 changes: 15 additions & 2 deletions docker-compose-70b.yml
@@ -2,13 +2,26 @@ version: '3.6'

 services:
   llama-gpt-api:
     # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-70b-chat:latest'
     build:
       context: ./api
-      dockerfile: 70B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-70b-chat.bin'
       N_GQA: '8'
+      GPU_LAYERS: 50
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
16 changes: 14 additions & 2 deletions docker-compose.yml
@@ -2,13 +2,25 @@ version: '3.6'

 services:
   llama-gpt-api:
     # image: ghcr.io/getumbrel/llama-gpt-api
     build:
       context: ./api
       dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-7b-chat.bin'
+      GPU_LAYERS: 100
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
@@ -20,4 +32,4 @@ services:
       - 'OPENAI_API_HOST=http://llama-gpt-api:8000'
       - 'DEFAULT_MODEL=/models/llama-2-7b-chat.bin'
       - 'WAIT_HOSTS=llama-gpt-api:8000'
-      - 'WAIT_TIMEOUT=600'
+      - 'WAIT_TIMEOUT=600'
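Taken together, the three Compose files now share the single `api/Dockerfile` and differ mainly in model path and `GPU_LAYERS` value (100 for 7B, 45 for 13B, 50 for 70B). A usage sketch, assuming the referenced model files have already been placed in `./models`:

```bash
# 7B (default compose file)
docker compose up --build

# 13B or 70B variants
docker compose -f docker-compose-13b.yml up --build
docker compose -f docker-compose-70b.yml up --build
```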