Local gpt v1 #771

Open · wants to merge 14 commits into base: main
21 changes: 21 additions & 0 deletions Dockerfile
@@ -0,0 +1,21 @@
# syntax=docker/dockerfile:1
# Build as `docker build . -t localgpt`, requires BuildKit.
# Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`, requires Nvidia container toolkit.

FROM nvidia/cuda:11.7.1-runtime-ubuntu22.04
RUN apt-get update && apt-get install -y software-properties-common
RUN apt-get install -y g++-11 make python3 python-is-python3 pip
# only copy what's needed at every step to optimize layer cache
COPY ./requirements.txt .
# use BuildKit cache mount to drastically reduce redownloading from pip on repeated builds
RUN --mount=type=cache,target=/root/.cache CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install --timeout 100 -r requirements.txt llama-cpp-python==0.1.83
COPY SOURCE_DOCUMENTS ./SOURCE_DOCUMENTS
COPY ingest.py constants.py ./
# Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*.
# See <https://github.com/moby/buildkit/issues/1436>.
# If this changes in the future you can `docker build --build-arg device_type=cuda . -t localgpt` (+GPU argument to be determined).
ARG device_type=cpu
RUN --mount=type=cache,target=/root/.cache python ingest.py --device_type $device_type
COPY . .
ENV device_type=cuda
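# The env var above can be overridden at run time without rebuilding the image, e.g. (illustrative):
#   docker run -it -e device_type=cpu localgpt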
CMD python run_localGPT.py --device_type $device_type
329 changes: 194 additions & 135 deletions README.md

Large diffs are not rendered by default.

Binary file removed SOURCE_DOCUMENTS/constitution.pdf
216 changes: 202 additions & 14 deletions constants.py
@@ -4,36 +4,224 @@
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader
from langchain.document_loaders import CSVLoader, PyPDFLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader, UnstructuredODTLoader, UnstructuredHTMLLoader, UnstructuredCSVLoader
from langchain.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader, UnstructuredPDFLoader

# load_dotenv()
ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))

# Define the folder for storing database
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"

PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/chroma"

RESET_DB = True

MODELS_PATH = "./models"

# Can be changed to a specific number
INGEST_THREADS = os.cpu_count() or 8
INGEST_THREADS = max(1, (os.cpu_count() or 8) - 2)  # leave a couple of cores free, never fewer than 1 thread

# Define the Chroma settings
CHROMA_SETTINGS = Settings(
chroma_db_impl="duckdb+parquet", persist_directory=PERSIST_DIRECTORY, anonymized_telemetry=False
anonymized_telemetry=False,
is_persistent=True,
)
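
# Illustrative sketch (not the exact ingest.py code) of how these settings typically reach
# the vector store through LangChain's Chroma wrapper:
#
#   from langchain.vectorstores import Chroma
#   db = Chroma.from_documents(
#       texts,                                   # chunked Documents produced during ingestion
#       embeddings,                              # see EMBEDDING_MODEL_NAME below
#       persist_directory=PERSIST_DIRECTORY,
#       client_settings=CHROMA_SETTINGS,
#   )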

# Context Window and Max New Tokens
CONTEXT_WINDOW_SIZE = 4096
MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE // 4

#### If you get a "not enough space in the buffer" error, reduce the values below:
#### start with half of the original values and keep halving until the error stops.

N_GPU_LAYERS = 23 # Llama-2-70B has 83 layers
N_BATCH = 512

### From experimenting with the Llama-2-7B-Chat-GGML model on 8GB VRAM, these values work:
# N_GPU_LAYERS = 20
# N_BATCH = 512
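
# Illustrative sketch (not the exact run_localGPT.py code) of how the values above typically
# reach llama.cpp through LangChain's LlamaCpp wrapper:
#
#   from langchain.llms import LlamaCpp
#   llm = LlamaCpp(
#       model_path="models/llama-2-7b-chat.Q4_K_M.gguf",  # hypothetical local GGUF path
#       n_ctx=CONTEXT_WINDOW_SIZE,
#       max_tokens=MAX_NEW_TOKENS,
#       n_gpu_layers=N_GPU_LAYERS,
#       n_batch=N_BATCH,
#   )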


# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
DOCUMENT_MAP = {
".txt": TextLoader,
".py": TextLoader,
".pdf": PDFMinerLoader,
".csv": CSVLoader,
".xls": UnstructuredExcelLoader,
".xlxs": UnstructuredExcelLoader,
".txt": TextLoader, # good
".md": TextLoader, # good
".py": TextLoader, # good
# ".pdf": PyPDFLoader,
".pdf": UnstructuredPDFLoader, # good
".csv": CSVLoader, # reads stuff in, but needs a lot more work.
".xls": UnstructuredExcelLoader, # not tested, likely needs same thing as csv
".xlsx": UnstructuredExcelLoader, # not tested, likely needs same thing as csv
".docx": Docx2txtLoader, # good
".doc": Docx2txtLoader, # good
".odt": UnstructuredODTLoader, # freezes UI?
".html": UnstructuredHTMLLoader, # good
".jpg": TextLoader,
".png": TextLoader,
}
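
# Illustrative sketch of how ingestion code typically dispatches on DOCUMENT_MAP
# (the function name and return handling are assumptions, not the exact ingest.py code):
#
#   def load_single_document(file_path: str):
#       ext = os.path.splitext(file_path)[1]
#       loader_class = DOCUMENT_MAP.get(ext)
#       if loader_class is None:
#           raise ValueError(f"Unsupported file extension: {ext}")
#       return loader_class(file_path).load()[0]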

# Default Instructor Model
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"
# You can also choose a smaller model; don't forget to change HuggingFaceInstructEmbeddings
# to HuggingFaceEmbeddings in both ingest.py and run_localGPT.py.
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# EMBEDDING_MODEL_NAME = "hkunlp/instructor-large" # Uses 1.5 GB of VRAM (High Accuracy with lower VRAM usage)

DEVICE_TYPE = "cuda" # "cpu"

####
#### OTHER EMBEDDING MODEL OPTIONS
####
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"
# EMBEDDING_MODEL_NAME = "hkunlp/instructor-xl" # Uses 5 GB of VRAM (Most Accurate of all models)
# EMBEDDING_MODEL_NAME = "intfloat/e5-large-v2" # Uses 1.5 GB of VRAM (A little less accurate than instructor-large)
# EMBEDDING_MODEL_NAME = "intfloat/e5-base-v2" # Uses 0.5 GB of VRAM (A good model for lower VRAM GPUs)
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2" # Uses 0.2 GB of VRAM (Less accurate but fastest - only requires 150mb of vram)

####
#### MULTILINGUAL EMBEDDING MODELS
####

# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-large" # Uses 2.5 GB of VRAM
# EMBEDDING_MODEL_NAME = "intfloat/multilingual-e5-base" # Uses 1.2 GB of VRAM


#### SELECT AN OPEN SOURCE LLM (LARGE LANGUAGE MODEL)
# Select the Model ID and model_basename
# load the LLM for generating Natural Language responses

#### GPU VRAM required for LLM models (ONLY), by billion-parameter size (B Model)
#### Does not include VRAM used by the embedding model.
####
#### (B Model)   (float32)    (float16)    (GPTQ 8bit)        (GPTQ 4bit)
####    7b        28 GB        14 GB        7 GB - 9 GB        3.5 GB - 5 GB
####    13b       52 GB        26 GB        13 GB - 15 GB      6.5 GB - 8 GB
####    32b       130 GB       65 GB        32.5 GB - 35 GB    16.25 GB - 19 GB
####    65b       260.8 GB     130.4 GB     65.2 GB - 67 GB    32.6 GB - 35 GB

# MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
# MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"

####
#### (FOR GGUF MODELS)
####

# MODEL_ID = "TheBloke/Llama-2-13b-Chat-GGUF"
# MODEL_BASENAME = "llama-2-13b-chat.Q4_K_M.gguf"

# MODEL_ID = "TheBloke/Llama-2-7b-Chat-GGUF"
# MODEL_BASENAME = "llama-2-7b-chat.Q4_K_M.gguf"

# MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
# MODEL_BASENAME = "mistral-7b-instruct-v0.1.Q8_0.gguf"

# MODEL_ID = "TheBloke/Llama-2-70b-Chat-GGUF"
# MODEL_BASENAME = "llama-2-70b-chat.Q4_K_M.gguf"

# MODEL_ID = "TheBloke/dolphin-2.2.1-mistral-7B-GGUF"
# MODEL_BASENAME = "dolphin-2.2.1-mistral-7b.Q4_K_M.gguf"

# MODEL_ID = "TheBloke/Mixtral-Fusion-4x7B-Instruct-v0.1-GGUF"
# MODEL_BASENAME = "mixtral-fusion-4x7b-instruct-v0.1.Q4_K_M.gguf"

# MODEL_ID = "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF"
# MODEL_BASENAME = "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf"

# MODEL_ID = "TheBloke/dolphin-2.5-mixtral-8x7b-GGUF"
# MODEL_BASENAME = "dolphin-2.5-mixtral-8x7b.Q4_K_M.gguf"

####
#### (FOR HF MODELS)
####

# MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"
# MODEL_BASENAME = None
# MODEL_ID = "TheBloke/vicuna-7B-1.1-HF"
# MODEL_BASENAME = None
# MODEL_ID = "TheBloke/Wizard-Vicuna-7B-Uncensored-HF"
# MODEL_ID = "TheBloke/guanaco-7B-HF"
# MODEL_ID = 'NousResearch/Nous-Hermes-13b' # Requires ~ 23GB VRAM. Using STransformers
# alongside it will reliably cause OOM on 24GB cards.
# llm = load_model(device_type, model_id=model_id)

####
#### (FOR GPTQ QUANTIZED) Select an LLM based on your GPU and available VRAM (GB). Does not include the embedding model's VRAM usage.
####

##### 48GB VRAM Graphics Cards (RTX 6000, RTX A6000 and other 48GB VRAM GPUs) #####

### 65b GPTQ LLM Models for 48GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# MODEL_ID = "TheBloke/guanaco-65B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Airoboros-65B-GPT4-2.0-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/gpt4-alpaca-lora_mlp-65B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Upstage-Llama1-65B-Instruct-GPTQ"
# MODEL_BASENAME = "model.safetensors"

##### 24GB VRAM Graphics Cards (RTX 3090 - RTX 4090 (35% Faster) - RTX A5000 - RTX A5500) #####

### 13b GPTQ Models for 24GB GPUs (*** With best embedding model: hkunlp/instructor-xl ***)
# MODEL_ID = "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ"
# MODEL_BASENAME = "Wizard-Vicuna-13B-Uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# MODEL_ID = "TheBloke/vicuna-13B-v1.5-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Nous-Hermes-13B-GPTQ"
# MODEL_BASENAME = "nous-hermes-13b-GPTQ-4bit-128g.no-act.order"
# MODEL_ID = "TheBloke/WizardLM-13B-V1.2-GPTQ"
# MODEL_BASENAME = "gptq_model-4bit-128g.safetensors"

### 30b GPTQ Models for 24GB GPUs (*** Requires using intfloat/e5-base-v2 instead of hkunlp/instructor-large as embedding model ***)
# MODEL_ID = "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ"
# MODEL_BASENAME = "Wizard-Vicuna-30B-Uncensored-GPTQ-4bit--1g.act.order.safetensors"
# MODEL_ID = "TheBloke/WizardLM-30B-Uncensored-GPTQ"
# MODEL_BASENAME = "WizardLM-30B-Uncensored-GPTQ-4bit.act-order.safetensors"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"
# MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"
# MODEL_ID = "TheBloke/wizardLM-7B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/WizaUnstructuredHTMLLoaderrdLM-1.0-Uncensored-Llama2-13B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Skywork-13B-base-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Llama-2-7b-Chat-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Xwin-MLewd-7B-V0.2-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/vicuna-7B-v1.5-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Llama-2-13b-Chat-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/platypus-yi-34b-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/WizardLM-1.0-Uncensored-Llama2-13B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/Orca-2-7B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/NeuralHermes-2.5-Mistral-7B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/OpenHermes-2.5-neural-chat-7B-v3-2-7B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/dolphin-2_2-yi-34b-GPTQ"
# MODEL_BASENAME = "model.safetensors"
# MODEL_ID = "TheBloke/dolphin-2.2.1-mistral-7B-GPTQ"
# MODEL_BASENAME = "model.safetensors"
MODEL_ID = "TheBloke/dolphin-2.6-mistral-7B-dpo-GPTQ"
MODEL_BASENAME = "model.safetensors"

####
#### (FOR GGML) (Quantized cpu+gpu+mps) models - check if they support llama.cpp
####

# MODEL_ID = "TheBloke/wizard-vicuna-13B-GGML"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q4_0.bin"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q6_K.bin"
# MODEL_BASENAME = "wizard-vicuna-13B.ggmlv3.q2_K.bin"
# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"

####
#### (FOR AWQ QUANTIZED) Select an LLM based on your GPU and available VRAM (GB). Does not include the embedding model's VRAM usage.
### (*** MODEL_BASENAME is not actually used, but it has to contain ".awq" so the correct model-loading path is chosen ***)
### (*** Compute capability 7.5 (sm75) and CUDA Toolkit 11.8+ are required ***)
####
# MODEL_ID = "TheBloke/Llama-2-7B-Chat-AWQ"
# MODEL_BASENAME = "model.safetensors.awq"
91 changes: 91 additions & 0 deletions crawl.py
@@ -0,0 +1,91 @@
import os
import shutil
import click
import subprocess

from constants import (
DOCUMENT_MAP,
SOURCE_DIRECTORY
)
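
# Workflow: files dropped into --landing_directory are moved one at a time into SOURCE_DIRECTORY,
# ingest.py is run against them, and each file is then sorted into the processed, error, or
# unsupported directory depending on the outcome.
#
# Example invocation (illustrative):
#   python crawl.py --device_type cuda --landing_directory ./LANDING_DOCUMENTS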

def logToFile(logentry):
    # Append to the crawl log and echo to stdout
    with open("crawl.log", "a") as logfile:
        logfile.write(logentry + "\n")
    print(logentry + "\n")

@click.command()
@click.option(
"--device_type",
default="cuda",
type=click.Choice(
[
"cpu",
"cuda",
"ipu",
"xpu",
"mkldnn",
"opengl",
"opencl",
"ideep",
"hip",
"ve",
"fpga",
"ort",
"xla",
"lazy",
"vulkan",
"mps",
"meta",
"hpu",
"mtia",
],
),
help="Device to run on. (Default is cuda)",
)
@click.option(
"--landing_directory",
default="./LANDING_DOCUMENTS"
)
@click.option(
"--processed_directory",
default="./PROCESSED_DOCUMENTS"
)
@click.option(
"--error_directory",
default="./ERROR_DOCUMENTS"
)
@click.option(
"--unsupported_directory",
default="./UNSUPPORTED_DOCUMENTS"
)

def main(device_type, landing_directory, processed_directory, error_directory, unsupported_directory):
    paths = []

    os.makedirs(processed_directory, exist_ok=True)
    os.makedirs(error_directory, exist_ok=True)
    os.makedirs(unsupported_directory, exist_ok=True)

    for root, _, files in os.walk(landing_directory):
        for file_name in files:
            file_extension = os.path.splitext(file_name)[1]
            short_filename = os.path.basename(file_name)

            if not os.path.isdir(root + "/" + file_name):
                if file_extension in DOCUMENT_MAP.keys():
                    # Supported file type: stage it in SOURCE_DIRECTORY and ingest it on its own
                    shutil.move(root + "/" + file_name, SOURCE_DIRECTORY + "/" + short_filename)
                    logToFile("START: " + root + "/" + short_filename)
                    process = subprocess.Popen(
                        "python ingest.py --device_type=" + device_type, shell=True, stdout=subprocess.PIPE
                    )
                    process.wait()
                    if process.returncode > 0:
                        # Ingestion failed: quarantine the file so it is not retried
                        shutil.move(SOURCE_DIRECTORY + "/" + short_filename, error_directory + "/" + short_filename)
                        logToFile("ERROR: " + root + "/" + short_filename)
                    else:
                        # Ingestion succeeded: archive the file
                        logToFile("VALID: " + root + "/" + short_filename)
                        shutil.move(SOURCE_DIRECTORY + "/" + short_filename, processed_directory + "/" + short_filename)
                else:
                    # Unsupported extension: park the file for manual review
                    shutil.move(root + "/" + file_name, unsupported_directory + "/" + short_filename)

if __name__ == "__main__":
main()