NVIDIA GPU Support #11

Closed
26 changes: 0 additions & 26 deletions api/13B.Dockerfile

This file was deleted.

26 changes: 0 additions & 26 deletions api/70B.Dockerfile

This file was deleted.

25 changes: 10 additions & 15 deletions api/Dockerfile
@@ -1,21 +1,16 @@
 # Define the image argument and provide a default value
-ARG IMAGE=ghcr.io/abetlen/llama-cpp-python:latest
-
-# Define the model file name and download url
-ARG MODEL_FILE=llama-2-7b-chat.bin
-ARG MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin
+ARG IMAGE=nvidia/cuda:12.2.0-devel-ubuntu22.04

 FROM ${IMAGE}

-ARG MODEL_FILE
-ARG MODEL_DOWNLOAD_URL
-
-# Download the model file
-RUN apt-get update -y && \
-    apt-get install --yes curl && \
-    mkdir -p /models && \
-    curl -L -o /models/${MODEL_FILE} ${MODEL_DOWNLOAD_URL}
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=America/Seattle
+RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-pip git cmake
+RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
+RUN pip install numpy diskcache uvicorn fastapi sse-starlette pydantic-settings
+ENV HOST=0.0.0.0
+ENV PORT=8000

 # Get llama-cpp-python
 WORKDIR /app

 COPY . .
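With the download step gone, the image no longer bakes a model in; it only installs a cuBLAS-enabled llama-cpp-python on the CUDA devel base. For reference, it can be exercised outside of Compose. A minimal sketch, assuming the tag `llama-gpt-api-cuda` is arbitrary and that the container's default command starts the API the same way `run.sh` does:

```bash
# Build the CUDA-enabled API image from the api/ directory
docker build -t llama-gpt-api-cuda ./api

# Run it with GPU access; MODEL and GPU_LAYERS mirror the Compose environment
docker run --rm --gpus all \
  -v "$(pwd)/models:/models" \
  -e MODEL=/models/llama-2-7b-chat.bin \
  -e GPU_LAYERS=100 \
  -p 8000:8000 \
  llama-gpt-api-cuda
```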
8 changes: 3 additions & 5 deletions api/run.sh
@@ -1,18 +1,16 @@
 #!/bin/bash

-make build
-
 # Get the number of available threads on the system
 n_threads=$(grep -c ^processor /proc/cpuinfo)

 # Define context window
 n_ctx=4096

-# Offload everything to CPU
-n_gpu_layers=0
+# Offload some to GPU - don't really know how to auto-calculate this
+n_gpu_layers=${GPU_LAYERS}

 # Define batch size
-n_batch=2096
+n_batch=2048
 # If total RAM is less than 8GB, set batch size to 1024
 total_ram=$(cat /proc/meminfo | grep MemTotal | awk '{print $2}')
 if [ $total_ram -lt 8000000 ]; then
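The `n_gpu_layers=${GPU_LAYERS}` change leaves the layer count to the caller, and the inline comment admits there is no automatic calculation. One possible heuristic, sketched below and not part of this PR, is to derive a default from total VRAM; the per-layer size and the 32-layer cap are rough assumptions for a q4-quantized 7B model:

```bash
#!/bin/bash
# Hypothetical helper: pick a default for GPU_LAYERS from total VRAM.
# Not part of this PR; the MiB-per-layer figure is a coarse estimate.

vram_mib=$(nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits | head -n1)
# Assume roughly 160 MiB per offloaded layer for a q4-quantized 7B model
mib_per_layer=160
# Leave about 1 GiB of headroom for the KV cache and CUDA context
usable_mib=$(( vram_mib - 1024 ))

if [ -z "${GPU_LAYERS}" ]; then
  GPU_LAYERS=$(( usable_mib / mib_per_layer ))
  # A 7B model has 32 transformer layers; never request more than exist
  [ "$GPU_LAYERS" -gt 32 ] && GPU_LAYERS=32
fi
echo "Defaulting GPU_LAYERS to ${GPU_LAYERS}"
```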
18 changes: 15 additions & 3 deletions docker-compose-13b.yml
@@ -2,13 +2,25 @@ version: '3.6'

 services:
   llama-gpt-api:
     # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-13b-chat:latest'
     build:
       context: ./api
-      dockerfile: 13B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
-      MODEL: '/models/llama-2-13b-chat.bin'
+      MODEL: '/models/nous-hermes-llama2-13b.ggmlv3.q2_K.bin'
+      GPU_LAYERS: 45
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
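The added `deploy.resources.reservations.devices` block only takes effect when the host has the NVIDIA Container Toolkit configured as a Docker runtime. A quick way to confirm GPU passthrough before bringing the stack up; the CUDA tag simply mirrors the one used in `api/Dockerfile`:

```bash
# Should print the same table as running nvidia-smi directly on the host
docker run --rm --gpus all nvidia/cuda:12.2.0-devel-ubuntu22.04 nvidia-smi
```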
17 changes: 15 additions & 2 deletions docker-compose-70b.yml
@@ -2,13 +2,26 @@ version: '3.6'

 services:
   llama-gpt-api:
     # image: 'ghcr.io/getumbrel/llama-gpt-api-llama-2-70b-chat:latest'
     build:
       context: ./api
-      dockerfile: 70B.Dockerfile
+      dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-70b-chat.bin'
       N_GQA: '8'
+      GPU_LAYERS: 50
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
16 changes: 14 additions & 2 deletions docker-compose.yml
@@ -2,13 +2,25 @@ version: '3.6'

 services:
   llama-gpt-api:
     # image: ghcr.io/getumbrel/llama-gpt-api
     build:
       context: ./api
       dockerfile: Dockerfile
     restart: on-failure
+    volumes:
+      - ./models:/models
     environment:
       MODEL: '/models/llama-2-7b-chat.bin'
+      GPU_LAYERS: 100
+      USE_MLOCK: 1
+    cap_add:
+      - IPC_LOCK
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]

   llama-gpt-ui:
     image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
@@ -20,4 +32,4 @@ services:
       - 'OPENAI_API_HOST=http://llama-gpt-api:8000'
       - 'DEFAULT_MODEL=/models/llama-2-7b-chat.bin'
       - 'WAIT_HOSTS=llama-gpt-api:8000'
-      - 'WAIT_TIMEOUT=600'
+      - 'WAIT_TIMEOUT=600'
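Taken together, the three Compose files now share the single `api/Dockerfile` and differ mainly in model path and `GPU_LAYERS` value (100 for 7B, 45 for 13B, 50 for 70B). A usage sketch, assuming the referenced model files have already been placed in `./models`:

```bash
# 7B (default compose file)
docker compose up --build

# 13B or 70B variants
docker compose -f docker-compose-13b.yml up --build
docker compose -f docker-compose-70b.yml up --build
```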