feat(llama.cpp): add distributed llama.cpp inferencing #2324

Merged: 8 commits, May 14, 2024
5 changes: 5 additions & 0 deletions .env
@@ -71,6 +71,11 @@
### Define the number of parallel LLAMA.cpp workers (Defaults to 1)
# LLAMACPP_PARALLEL=1

### Define a list of GRPC Servers for llama-cpp workers to distribute the load
# https://github.com/ggerganov/llama.cpp/pull/6829
# https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md
# LLAMACPP_GRPC_SERVERS=""

### Enable to run parallel requests
# LOCALAI_PARALLEL_REQUESTS=true

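Illustration (not part of the diff): the sketch below shows how a LLAMACPP_GRPC_SERVERS value could be parsed, assuming — per the llama.cpp RPC README linked above — a comma-separated list of host:port entries. The backend itself simply forwards the raw string to llama.cpp (see the grpc-server.cpp change further down).

package main

import (
    "fmt"
    "os"
    "strings"
)

func main() {
    // e.g. LLAMACPP_GRPC_SERVERS="192.168.1.10:50052,192.168.1.11:50052"
    raw := os.Getenv("LLAMACPP_GRPC_SERVERS")
    if raw == "" {
        fmt.Println("no distributed llama.cpp workers configured")
        return
    }
    for _, addr := range strings.Split(raw, ",") {
        fmt.Println("llama.cpp RPC worker:", strings.TrimSpace(addr))
    }
}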
4 changes: 2 additions & 2 deletions .github/workflows/release.yaml
@@ -29,7 +29,7 @@ jobs:
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential ffmpeg protobuf-compiler
sudo apt-get install build-essential ffmpeg protobuf-compiler ccache
- name: Install CUDA Dependencies
run: |
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
@@ -86,7 +86,7 @@ jobs:
cache: false
- name: Dependencies
run: |
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler
sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@latest
go install google.golang.org/protobuf/cmd/protoc-gen-go@latest
- name: Build stablediffusion
1 change: 1 addition & 0 deletions Dockerfile
@@ -19,6 +19,7 @@ ARG GO_TAGS="stablediffusion tinydream tts"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ccache \
ca-certificates \
cmake \
curl \
17 changes: 15 additions & 2 deletions Makefile
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai

# llama.cpp versions
GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=dc685be46622a8fabfd57cfa804237c8f15679b8
CPPLLAMA_VERSION?=4f0263633b40e94e8b69fd6e7e4395cfedfd5c12

# gpt4all version
GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -158,6 +158,8 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/gpt4all
ALL_GRPC_BACKENDS+=backend-assets/grpc/rwkv
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -314,7 +316,7 @@ build: prepare backend-assets grpcs ## Build the project
CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o $(BINARY_NAME) ./

build-minimal:
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp" GO_TAGS=none $(MAKE) build
BUILD_GRPC_FOR_BACKEND_LLAMA=true GRPC_BACKENDS="backend-assets/grpc/llama-cpp-avx2" GO_TAGS=none $(MAKE) build

build-api:
BUILD_GRPC_FOR_BACKEND_LLAMA=true BUILD_API_ONLY=true GO_TAGS=none $(MAKE) build
@@ -691,6 +693,17 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda

backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
cp -rf backend/cpp/llama backend/cpp/llama-grpc
$(MAKE) -C backend/cpp/llama-grpc purge
$(info ${GREEN}I llama-cpp build info:grpc${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc

backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server

backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
6 changes: 6 additions & 0 deletions backend/cpp/llama/grpc-server.cpp
@@ -2217,6 +2217,12 @@ static void params_parse(const backend::ModelOptions* request,
} else {
params.n_parallel = 1;
}

const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
if (llama_grpc_servers != NULL) {
params.rpc_servers = std::string(llama_grpc_servers);
}

// TODO: Add yarn

if (!request->tensorsplit().empty()) {
9 changes: 5 additions & 4 deletions core/cli/cli.go
@@ -13,8 +13,9 @@ type Context struct {
var CLI struct {
Context `embed:""`

Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
Run RunCMD `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
Models ModelsCMD `cmd:"" help:"Manage LocalAI models and definitions"`
TTS TTSCMD `cmd:"" help:"Convert text to speech"`
Transcript TranscriptCMD `cmd:"" help:"Convert audio to text"`
LLAMACPPWorker LLAMACPPWorkerCMD `cmd:"" help:"Run workers to distribute workload (llama.cpp-only)"`
}
37 changes: 37 additions & 0 deletions core/cli/llamacppworker.go
@@ -0,0 +1,37 @@
package cli

import (
"os"
"syscall"

"github.com/go-skynet/LocalAI/pkg/assets"
"github.com/rs/zerolog/log"
)

type LLAMACPPWorkerCMD struct {
Args []string `arg:"" optional:"" name:"models" help:"Worker arguments: host port"`
BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
}

func (r *LLAMACPPWorkerCMD) Run(ctx *Context) error {
// Extract files from the embedded FS
err := assets.ExtractFiles(ctx.BackendAssets, r.BackendAssetsPath)
log.Debug().Msgf("Extracting backend assets files to %s", r.BackendAssetsPath)
if err != nil {
log.Warn().Msgf("Failed extracting backend assets files: %s (might be required for some backends to work properly, like gpt4all)", err)
}

// Replace the current process with the extracted llama.cpp rpc-server;
// by convention argv[0] is the program path, so it is resolved twice below.
return syscall.Exec(
assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
),
append([]string{
assets.ResolvePath(
r.BackendAssetsPath,
"util",
"llama-cpp-rpc-server",
)}, r.Args...),
os.Environ())
}
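A hedged usage note (not from the PR text): the intended flow appears to be to run this worker subcommand on each node that should serve llama.cpp over RPC — it exec()s the rpc-server binary extracted to backend-assets/util, forwarding the host/port arguments verbatim — and then to list those host:port pairs in LLAMACPP_GRPC_SERVERS on the main LocalAI instance, as documented in the .env change above.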
33 changes: 27 additions & 6 deletions core/config/backend_config.go
@@ -93,6 +93,8 @@ type Diffusers struct {
ControlNet string `yaml:"control_net"`
}

// LLMConfig is a struct that holds the configuration that is
// generic for most of the LLM backends.
type LLMConfig struct {
SystemPrompt string `yaml:"system_prompt"`
TensorSplit string `yaml:"tensor_split"`
@@ -144,20 +146,39 @@ type LLMConfig struct {
YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
}

// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
Device string `yaml:"device"`
Triton bool `yaml:"triton"`
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}

// TemplateConfig is a struct that holds the configuration of the templating system
type TemplateConfig struct {
Chat string `yaml:"chat"`
ChatMessage string `yaml:"chat_message"`
Completion string `yaml:"completion"`
Edit string `yaml:"edit"`
Functions string `yaml:"function"`
UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`
// Chat is the template used in the chat completion endpoint
Chat string `yaml:"chat"`

// ChatMessage is the template used for chat messages
ChatMessage string `yaml:"chat_message"`

// Completion is the template used for completion requests
Completion string `yaml:"completion"`

// Edit is the template used for edit completion requests
Edit string `yaml:"edit"`

// Functions is the template used when tools are present in the client requests
Functions string `yaml:"function"`

// UseTokenizerTemplate is a flag that indicates if the tokenizer template should be used.
// Note: this is mostly consumed for backends such as vllm and transformers
// that can use the tokenizers specified in the JSON config files of the models
UseTokenizerTemplate bool `yaml:"use_tokenizer_template"`

// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
// It defaults to \n
JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`
}

func (c *BackendConfig) SetFunctionCallString(s string) {
7 changes: 6 additions & 1 deletion core/http/endpoints/openai/chat.go
@@ -349,7 +349,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
mess = append(mess, content)
}

predInput = strings.Join(mess, "\n")
joinCharacter := "\n"
if config.TemplateConfig.JoinChatMessagesByCharacter != nil {
joinCharacter = *config.TemplateConfig.JoinChatMessagesByCharacter
}

predInput = strings.Join(mess, joinCharacter)
log.Debug().Msgf("Prompt (before templating): %s", predInput)

templateFile := ""
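Illustration (not part of the diff): a minimal sketch of the join behaviour introduced here — a nil join_chat_messages_by_character keeps the previous "\n" behaviour, while an explicit value (even an empty string) replaces it.

package main

import (
    "fmt"
    "strings"
)

// joinMessages mirrors the logic added to ChatEndpoint: use the configured
// character when set, otherwise fall back to "\n".
func joinMessages(mess []string, joinChatMessagesByCharacter *string) string {
    joinCharacter := "\n"
    if joinChatMessagesByCharacter != nil {
        joinCharacter = *joinChatMessagesByCharacter
    }
    return strings.Join(mess, joinCharacter)
}

func main() {
    mess := []string{"user: Hello", "assistant: Hi!"}

    fmt.Printf("%q\n", joinMessages(mess, nil)) // "user: Hello\nassistant: Hi!"

    empty := ""
    fmt.Printf("%q\n", joinMessages(mess, &empty)) // "user: Helloassistant: Hi!"
}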
6 changes: 5 additions & 1 deletion pkg/assets/extract.go
@@ -8,6 +8,10 @@
"path/filepath"
)

func ResolvePath(dir string, paths ...string) string {
return filepath.Join(append([]string{dir, "backend-assets"}, paths...)...)
}

func ExtractFiles(content embed.FS, extractDir string) error {
// Create the target directory if it doesn't exist
err := os.MkdirAll(extractDir, 0750)
@@ -39,7 +43,7 @@
}

// Create the file in the target directory
err = os.WriteFile(targetFile, fileData, 0600)
err = os.WriteFile(targetFile, fileData, 0700)
if err != nil {
return fmt.Errorf("failed to write file: %v", err)
}
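Illustration (not part of the diff): with the default backend assets path used by the worker command above, the new ResolvePath helper yields the path of the extracted rpc-server binary. The import path matches the one used in core/cli/llamacppworker.go.

package main

import (
    "fmt"

    "github.com/go-skynet/LocalAI/pkg/assets"
)

func main() {
    p := assets.ResolvePath("/tmp/localai/backend_data", "util", "llama-cpp-rpc-server")
    // Prints: /tmp/localai/backend_data/backend-assets/util/llama-cpp-rpc-server
    fmt.Println(p)
}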