diff --git a/api/13B.Dockerfile b/api/13B.Dockerfile
deleted file mode 100644
index 474d77f..0000000
--- a/api/13B.Dockerfile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-13b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Nous-Hermes-Llama2-GGML/resolve/main/nous-hermes-llama2-13b.ggmlv3.q4_0.bin
-
-FROM ${IMAGE}
-
-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
-
-WORKDIR /app
-
-COPY . .
-
-EXPOSE 8000
-
-# Run the server start script
-CMD ["/bin/sh", "/app/run.sh"]
\ No newline at end of file
diff --git a/api/70B.Dockerfile b/api/70B.Dockerfile
deleted file mode 100644
index 036797b..0000000
--- a/api/70B.Dockerfile
+++ /dev/null
@@ -1,26 +0,0 @@
-# Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-70b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Llama-2-70B-Chat-GGML/resolve/main/llama-2-70b-chat.ggmlv3.q4_0.bin
-
-FROM ${IMAGE}
-
-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
-
-WORKDIR /app
-
-COPY . .
-
-EXPOSE 8000
-
-# Run the server start script
-CMD ["/bin/sh", "/app/run.sh"]
\ No newline at end of file
diff --git a/api/Dockerfile b/api/Dockerfile
index 731a2b8..ce9d4e5 100644
--- a/api/Dockerfile
+++ b/api/Dockerfile
@@ -1,21 +1,16 @@
 # Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-7b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin
+ARG IMAGE=nvidia/cuda:12.2.0-devel-ubuntu22.04
 
 FROM ${IMAGE}
-
-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
-
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=America/Seattle
+RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN pip install numpy diskcache uvicorn fastapi sse-starlette pydantic-settings
+ENV HOST=0.0.0.0
+ENV PORT=8000
+
+# Get llama-cpp-python
 WORKDIR /app
 
 COPY . .
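Note: the rewritten api/Dockerfile no longer downloads model weights at build time; the compose files below mount a host ./models directory into the container instead, so the model file has to exist on the host first. A minimal sketch of pre-fetching the 7B model, reusing the download URL from the deleted Dockerfile and the filename that docker-compose.yml expects in MODEL (the 13B compose file now points at a q2_K quantization, so its URL and filename would differ):

# Sketch only: run on the host before starting the stack.
# URL taken from the deleted api/Dockerfile; the target name matches MODEL in docker-compose.yml.
mkdir -p ./models
curl -L -o ./models/llama-2-7b-chat.bin \
  https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin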
diff --git a/api/run.sh b/api/run.sh
index 38d8118..80f8ba2 100755
--- a/api/run.sh
+++ b/api/run.sh
@@ -1,18 +1,16 @@
 #!/bin/bash
-make build
-
 # Get the number of available threads on the system
 n_threads=$(grep -c ^processor /proc/cpuinfo)
 
 # Define context window
 n_ctx=4096
 
-# Offload everything to CPU
-n_gpu_layers=0
+# Offload some to GPU - don't really know how to auto-calculate this
+n_gpu_layers=${GPU_LAYERS}
 
 # Define batch size
-n_batch=2096
+n_batch=2048
 
 # If total RAM is less than 8GB, set batch size to 1024
 total_ram=$(cat /proc/meminfo | grep MemTotal | awk '{print $2}')
 if [ $total_ram -lt 8000000 ]; then
diff --git a/docker-compose-13b.yml b/docker-compose-13b.yml
index ce1c5fc..cc2695a 100644
--- a/docker-compose-13b.yml
+++ b/docker-compose-13b.yml
@@ -2,13 +2,25 @@ version: '3.6'
 
 services:
   llama-gpt-api:
-    # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-13b-chat:latest'
     build:
       context: ./api
-      dockerfile: 13B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
-      MODEL: '/models/llama-2-13b-chat.bin'
+      MODEL: '/models/nous-hermes-llama2-13b.ggmlv3.q2_K.bin'
+      GPU_LAYERS: 45
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
 
   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
diff --git a/docker-compose-70b.yml b/docker-compose-70b.yml
index 80d4aac..cd9d31d 100644
--- a/docker-compose-70b.yml
+++ b/docker-compose-70b.yml
@@ -2,13 +2,26 @@ version: '3.6'
 
 services:
   llama-gpt-api:
-    # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-70b-chat:latest'
     build:
       context: ./api
-      dockerfile: 70B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-70b-chat.bin'
+      N_GQA: '8'
+      GPU_LAYERS: 50
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
 
   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
diff --git a/docker-compose.yml b/docker-compose.yml
index 4d9a9a4..89ed753 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,13 +2,25 @@ version: '3.6'
 
 services:
   llama-gpt-api:
-    # image: ghcr.io/getumbrel/llama-gpt-api
     build:
       context: ./api
       dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-7b-chat.bin'
+      GPU_LAYERS: 100
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
 
   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
@@ -20,4 +32,4 @@ services:
       - 'OPENAI_API_HOST=http://llama-gpt-api:8000'
       - 'DEFAULT_MODEL=/models/llama-2-7b-chat.bin'
      - 'WAIT_HOSTS=llama-gpt-api:8000'
-      - 'WAIT_TIMEOUT=600'
+      - 'WAIT_TIMEOUT=600'
\ No newline at end of file
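Usage sketch, assuming Docker Compose v2 and the NVIDIA Container Toolkit on the host (needed for the deploy.resources GPU reservation to take effect); the compose file names and the llama-gpt-api service name come from this diff:

# Bring up the GPU-enabled 7B stack (swap in docker-compose-13b.yml or
# docker-compose-70b.yml for the larger models).
docker compose -f docker-compose.yml up --build
# Optional: confirm the API container can see the GPU (nvidia-smi is normally
# injected by the NVIDIA runtime).
docker compose -f docker-compose.yml exec llama-gpt-api nvidia-smi
# To change how many layers are offloaded, edit GPU_LAYERS in the compose file;
# api/run.sh reads it into n_gpu_layers.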