benchmark.py
from ctransformers import AutoModelForCausalLM
import time
import argparse


# Define a custom formatter class
class MyFormatter(argparse.HelpFormatter):
    # Override the _format_action_invocation method
    def _format_action_invocation(self, action):
        # If the action has any option strings, use them
        if action.option_strings:
            # Join the option strings with commas and brackets
            parts = [f"[{option_string}]" for option_string in action.option_strings]
            return ", ".join(parts)
        # Otherwise, use the default implementation
        else:
            return super()._format_action_invocation(action)

# Create an ArgumentParser object with the custom formatter class
parser = argparse.ArgumentParser("simple_example", formatter_class=MyFormatter)
parser.add_argument("-m", "--model", help="model to use", type=str, dest="model")
args = parser.parse_args()
ourmodel = args.model
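
# Example invocation (the model path below is only a placeholder; pass your own
# GGML/GGUF model file with -m):
#   python benchmark.py -m ./models/your-model.bin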

# Default settings
llm = None
last_request_time = time.time()
model_type = "llama"
# How many GPU layers to offload (run the GPU layers benchmark below to find the optimal value)
gpu_layers = 34
# How many tokens to generate
max_new_tokens = 256
# Sampling temperature of the model
temperature = 0.8
# Repetition penalty
repetition_penalty = 1.1
# How many threads to use (run the threads benchmark below to find the optimal value)
threads = 1
besttime = 99999
bestlayers = 0
rounds_without_improvement = 0
batch_size = 8
# Test prompt in the model's ### HUMAN / ###RESPONSE instruction format
prompt = "### HUMAN:\nRewrite this using proper grammar and a more concise writing style:\n The Kilauea Volcano in Hawaii, one of the world's most active volcanoes, has started erupting, unleashing lava fountains inside its crater.\n###RESPONSE:\nRewrite:"
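
# Each benchmark below bumps its parameter by one per round and recurses until it
# either hits a RuntimeError or goes 10 rounds (150 for batch_size) without
# beating the best time seen so far.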


def benchmark_gpu_layers(layers):
    # Try one more GPU layer each round and keep the fastest configuration
    global gpu_layers
    global besttime
    global bestlayers
    global rounds_without_improvement
    global prompt
    gpu_layers = layers + 1
    timer = time.time()
    try:
        print(f"\n#Next test! Trying to load model with {gpu_layers} layers")
        load_model()
        gettingAlpaca(prompt)
        unload_model()
    except RuntimeError:
        print("#Oops! Your GPU can't handle that many layers")
        unload_model()
        print(besttime)
        print(bestlayers)
        return gpu_layers
    timetook = time.time() - timer
    if timetook < besttime:
        besttime = timetook
        bestlayers = gpu_layers
        rounds_without_improvement = 0
        print(f"####New best time so far: {besttime}")
        print(f"####New best layers so far: {bestlayers}")
    else:
        rounds_without_improvement += 1
    print(f"#Time taken for this run: {timetook}")
    print(f"#Best time so far: {besttime}")
    print(f"#Best layers so far: {bestlayers}")
    print(f"#Rounds without improvement: {rounds_without_improvement}")
    if rounds_without_improvement > 10:
        return gpu_layers
    benchmark_gpu_layers(gpu_layers)


def benchmark_threads(howmanythreads):
    # Try one more thread each round and keep the fastest configuration
    # (bestlayers is reused here to hold the best thread count)
    global threads
    global besttime
    global bestlayers
    global rounds_without_improvement
    global prompt
    threads = howmanythreads + 1
    timer = time.time()
    try:
        print(f"\n#Next test! Trying to load model with {threads} threads")
        load_model()
        gettingAlpaca(prompt)
        unload_model()
    except RuntimeError:
        print("#Oops! We can't handle that many threads")
        unload_model()
        print(besttime)
        print(bestlayers)
        return threads
    timetook = time.time() - timer
    if timetook < besttime:
        besttime = timetook
        bestlayers = threads
        rounds_without_improvement = 0
        print(f"####New best time so far: {besttime}")
        print(f"####New best thread number so far: {bestlayers}")
    else:
        rounds_without_improvement += 1
    print(f"#Time taken for this run: {timetook}")
    print(f"#Best time so far: {besttime}")
    print(f"#Best thread number so far: {bestlayers}")
    print(f"#Rounds without improvement: {rounds_without_improvement}")
    if rounds_without_improvement > 10:
        return threads
    benchmark_threads(threads)


def benchmark_batch_size(size_of_batch):
    # Try a larger batch size each round and keep the fastest configuration
    # (bestlayers is reused here to hold the best batch size)
    global batch_size
    global besttime
    global bestlayers
    global rounds_without_improvement
    global prompt
    batch_size = size_of_batch + 1
    timer = time.time()
    try:
        print(f"\n#Next test! Trying to load model with {batch_size} as batch_size")
        load_model()
        gettingAlpaca(prompt)
        unload_model()
    except RuntimeError:
        print("#Oops! We can't handle a batch_size that large")
        unload_model()
        print(besttime)
        print(bestlayers)
        return batch_size
    timetook = time.time() - timer
    if timetook < besttime:
        besttime = timetook
        bestlayers = batch_size
        rounds_without_improvement = 0
        print(f"####New best time so far: {besttime}")
        print(f"####New best batch_size so far: {bestlayers}")
    else:
        rounds_without_improvement += 1
    print(f"#Time taken for this run: {timetook}")
    print(f"#Best time so far: {besttime}")
    print(f"#Best batch_size so far: {bestlayers}")
    print(f"#Rounds without improvement: {rounds_without_improvement}")
    if rounds_without_improvement > 150:
        return batch_size
    benchmark_batch_size(batch_size)


def gettingAlpaca(prompt):
    # Start the timer
    start_time = time.time()
    global last_request_time
    # Update the last request time with the current time
    last_request_time = time.time()
    output = llm(prompt)
    # Estimate tokens per second (len(output) counts characters, so this is only a rough proxy)
    num_tokens = len(output)
    end_time = time.time()
    elapsed_time = end_time - start_time
    tokens_per_second = num_tokens / elapsed_time
    print(output)
    # Print the tokens per second
    print(f"\n\n#Tokens per second: {tokens_per_second}")
    return output


def load_model():
    # Load the model with the current global settings
    timer = time.time()
    print("Loading model...")
    global llm
    llm = AutoModelForCausalLM.from_pretrained(
        ourmodel,
        model_type=model_type,
        gpu_layers=gpu_layers,
        batch_size=batch_size,
        seed=5,
        threads=threads,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
    )
    print("Model loaded!")
    print(f"Time taken to load model: {time.time() - timer}")
    return llm


def unload_model():
    # Unload the model by dropping the reference so it can be garbage collected
    global llm
    llm = None
    print("Model unloaded!")


selector = input("Select benchmark option (first run GPU layers, then threads):\n1. Benchmark GPU layers\n2. Benchmark threads\n3. Benchmark batch_size\n4. Exit\n")
if selector == "1":
    print("Benchmarking GPU layers")
    threads = int(input("How many threads do you want to use?\n"))
    gpu_layers = int(input("How many GPU layers do you want to start with?\n"))
    benchmark_gpu_layers(gpu_layers)
if selector == "2":
    gpu_layers = int(input("How many GPU layers do you want to use? (Run the GPU layers benchmark for optimal performance)\n"))
    print("Benchmarking threads")
    threads = int(input("How many threads do you want to start with?\n"))
    benchmark_threads(threads)
if selector == "3":
    gpu_layers = int(input("How many GPU layers do you want to use? (Run the GPU layers benchmark for optimal performance)\n"))
    threads = int(input("How many threads do you want to use? (Run the threads benchmark for optimal performance)\n"))
    print("Benchmarking batch_size")
    batch_size = int(input("What batch_size do you want to start with?\n"))
    benchmark_batch_size(batch_size)
if selector == "4":
    exit()