diff --git a/api/13B.Dockerfile b/api/13B.Dockerfile
deleted file mode 100644
index 474d77f..0000000
--- a/api/13B.Dockerfile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-13b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Nous-Hermes-Llama2-GGML/resolve/main/nous-hermes-llama2-13b.ggmlv3.q4_0.bin
-
-FROM ${IMAGE}
-
-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
-
-WORKDIR /app
-
-COPY . .
-
-EXPOSE 8000
-
-# Run the server start script
-CMD ["/bin/sh", "/app/run.sh"]
\ No newline at end of file
diff --git a/api/70B.Dockerfile b/api/70B.Dockerfile
deleted file mode 100644
index 036797b..0000000
--- a/api/70B.Dockerfile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-70b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGML/resolve/main/llama-2-70b-chat.ggmlv3.q4_0.bin
-
-FROM ${IMAGE}
-
-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
-
-WORKDIR /app
-
-COPY . .
-
-EXPOSE 8000
-
-# Run the server start script
-CMD ["/bin/sh", "/app/run.sh"]
\ No newline at end of file
diff --git a/api/Dockerfile b/api/Dockerfile
index 731a2b8..ce9d4e5 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -1,21 +1,16 @@
 # Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-7b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin
+ARG IMAGE=nvidia/cuda:12.2.0-devel-ubuntu22.04
 
 FROM ${IMAGE}
-
-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
-
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=America/Seattle
+RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN pip install numpy diskcache uvicorn fastapi sse-starlette pydantic-settings
+ENV HOST=0.0.0.0
+ENV PORT=8000
+
+# Get llama-cpp-python
 WORKDIR /app
 
 COPY . .
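Note: the rewritten api/Dockerfile no longer downloads model weights at build time; the compose files below mount a host ./models directory into the container instead, so the model file has to exist on the host first. A minimal sketch of pre-fetching the 7B model, reusing the download URL from the deleted Dockerfile and the filename that docker-compose.yml expects in MODEL (the 13B compose file now points at a q2_K quantization, so its URL and filename would differ):

# Sketch only: run on the host before starting the stack.
# URL taken from the deleted api/Dockerfile; the target name matches MODEL in docker-compose.yml.
mkdir -p ./models
curl -L -o ./models/llama-2-7b-chat.bin \
  https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin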
diff --git a/api/run.sh b/api/run.sh
index 38d8118..80f8ba2 100755
--- a/api/run.sh
+++ b/api/run.sh
@@ -1,18 +1,16 @@
 #!/bin/bash
-make build
-
 # Get the number of available threads on the system
 n_threads=$(grep -c ^processor /proc/cpuinfo)
 
 # Define context window
 n_ctx=4096
 
-# Offload everything to CPU
-n_gpu_layers=0
+# Offload some to GPU - don't really know how to auto-calculate this
+n_gpu_layers=${GPU_LAYERS}
 
 # Define batch size
-n_batch=2096
+n_batch=2048
 
 # If total RAM is less than 8GB, set batch size to 1024
 total_ram=$(cat /proc/meminfo | grep MemTotal | awk '{print $2}')
 if [ $total_ram -lt 8000000 ]; then
diff --git a/docker-compose-13b.yml b/docker-compose-13b.yml
index ce1c5fc..cc2695a 100644
--- a/docker-compose-13b.yml
+++ b/docker-compose-13b.yml
@@ -2,13 +2,25 @@ version: '3.6'
 
 services:
   llama-gpt-api:
-    # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-13b-chat:latest'
     build:
       context: ./api
-      dockerfile: 13B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
-      MODEL: '/models/llama-2-13b-chat.bin'
+      MODEL: '/models/nous-hermes-llama2-13b.ggmlv3.q2_K.bin'
+      GPU_LAYERS: 45
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
 
   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
diff --git a/docker-compose-70b.yml b/docker-compose-70b.yml
index 80d4aac..cd9d31d 100644
--- a/docker-compose-70b.yml
+++ b/docker-compose-70b.yml
@@ -2,13 +2,26 @@ version: '3.6'
 
 services:
   llama-gpt-api:
-    # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-70b-chat:latest'
     build:
       context: ./api
-      dockerfile: 70B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-70b-chat.bin'
+      N_GQA: '8'
+      GPU_LAYERS: 50
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
 
   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
diff --git a/docker-compose.yml b/docker-compose.yml
index 4d9a9a4..89ed753 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,13 +2,25 @@ version: '3.6'
 
 services:
   llama-gpt-api:
-    # image: ghcr.io/getumbrel/llama-gpt-api
     build:
       context: ./api
       dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-7b-chat.bin'
+      GPU_LAYERS: 100
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
 
   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
@@ -20,4 +32,4 @@ services:
       - 'OPENAI_API_HOST=http://llama-gpt-api:8000'
       - 'DEFAULT_MODEL=/models/llama-2-7b-chat.bin'
      - 'WAIT_HOSTS=llama-gpt-api:8000'
-      - 'WAIT_TIMEOUT=600'
+      - 'WAIT_TIMEOUT=600'
\ No newline at end of file
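Usage sketch, assuming Docker Compose v2 and the NVIDIA Container Toolkit on the host (needed for the deploy.resources GPU reservation to take effect); the compose file names and the llama-gpt-api service name come from this diff:

# Bring up the GPU-enabled 7B stack (swap in docker-compose-13b.yml or
# docker-compose-70b.yml for the larger models).
docker compose -f docker-compose.yml up --build
# Optional: confirm the API container can see the GPU (nvidia-smi is normally
# injected by the NVIDIA runtime).
docker compose -f docker-compose.yml exec llama-gpt-api nvidia-smi
# To change how many layers are offloaded, edit GPU_LAYERS in the compose file;
# api/run.sh reads it into n_gpu_layers.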