llm_benchmark.py
import time
import json
import gc
import signal
from typing import Optional
from datetime import datetime

import ollama
import psutil


class TimeoutException(Exception):
    """Raised when an inference call exceeds its time budget."""


def timeout(seconds):
    """Decorator that aborts the wrapped call after `seconds` using SIGALRM (Unix-only)."""
    def decorator(func):
        def _handle_timeout(signum, frame):
            raise TimeoutException()

        def wrapper(*args, **kwargs):
            signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(seconds)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)  # Always cancel the pending alarm
            return result

        return wrapper

    return decorator


def run_with_timeout(prompt: str, model_name: str, timeout_seconds: int = 45) -> Optional[str]:
    """Run LLM inference with a timeout; return the response text, or None on timeout/error."""
    try:
        @timeout(timeout_seconds)
        def generate():
            time.sleep(1)  # Short settle delay before starting inference
            full_response = ""

            # Split the prompt into small fixed-size chunks for better memory management
            chunk_size = 64  # characters per chunk
            chunks = [prompt[i:i + chunk_size] for i in range(0, len(prompt), chunk_size)]

            for prompt_chunk in chunks:
                # Process each chunk with streaming
                for response_chunk in ollama.generate(model=model_name,
                                                      prompt=prompt_chunk,
                                                      stream=True):
                    if response_chunk and 'response' in response_chunk:
                        full_response += response_chunk['response']
                        # Print a progress indicator
                        print("▪", end="", flush=True)
                time.sleep(0.1)  # Brief pause between chunks

            print("\n", end="")  # New line after the progress dots
            return full_response

        return generate()
    except TimeoutException:
        print(f"\n⚠️ Inference timed out after {timeout_seconds} seconds")
        return None
    except Exception as e:
        print(f"\n⚠️ Error during inference: {str(e)}")
        return None


def warm_up_model(model_name: str):
    """Perform warm-up runs to stabilize performance"""
    print("\n🔄 Performing warm-up runs...")
    warm_up_prompts = ["Hi", "Hello", "Test"]
    for prompt in warm_up_prompts:
        print(f"Warming up: {prompt}")
        run_with_timeout(prompt, model_name, timeout_seconds=20)
        time.sleep(1)
    print("✓ Warm-up complete\n")


def benchmark_model(model_name: str, num_runs=2):
    """Run each test prompt `num_runs` times, report averages, and save results to JSON."""
    print(f"\n🔄 Benchmarking {model_name}...")

    # Test prompts
    test_prompts = [
        # Basic Prompts
        "Hi.",
        "What is your name?",
        # Analytical Tasks (Simplified)
        "Explain what a binary search is in one sentence.",
        "Name three renewable energy sources.",
        # Creative Tasks
        "Write a haiku about coding.",
        "Describe a sunset on Mars in one sentence.",
        # Logic and Reasoning (Simplified)
        "Calculate: 60 mph × 2.5 hours = ?",
        "Is this valid logic: All birds fly, penguins are birds, so penguins fly?",
        # Knowledge Integration (Focused)
        "What is the main difference between photosynthesis and respiration?",
        "Give one example of supply and demand.",
        # Edge Cases (Simplified)
        "List 5 random numbers between 1-100.",
        "Translate 'Hello' into Spanish and French."
    ]

    results = []
    for i, prompt in enumerate(test_prompts, 1):
        print(f"\n📝 Test {i}/{len(test_prompts)}: {prompt}")
        prompt_results = []

        for run in range(num_runs):
            print(f"\n🔄 Run {run + 1}/{num_runs}")
            print("Starting inference...")

            start_time = time.time()
            response = run_with_timeout(prompt, model_name)

            if response is not None:
                end_time = time.time()
                duration = end_time - start_time
                tokens = len(response.split())  # Rough token count: whitespace-separated words
                tokens_per_second = tokens / duration if duration > 0 else 0

                result = {
                    "prompt": prompt,
                    "response": response,
                    "duration": duration,
                    "tokens": tokens,
                    "tokens_per_second": tokens_per_second
                }
                prompt_results.append(result)

                print(f"✓ Response: {response}")
                print(f"⏱️ Time: {duration:.2f}s")
                print(f"📊 Tokens/sec: {tokens_per_second:.2f}")
            else:
                print("Skipping this run due to error")

            # Force garbage collection between runs
            gc.collect()

        if prompt_results:
            avg_duration = sum(r["duration"] for r in prompt_results) / len(prompt_results)
            avg_tokens = sum(r["tokens"] for r in prompt_results) / len(prompt_results)
            avg_tokens_per_second = sum(r["tokens_per_second"] for r in prompt_results) / len(prompt_results)

            results.append({
                "prompt": prompt,
                "avg_duration": avg_duration,
                "avg_tokens": avg_tokens,
                "avg_tokens_per_second": avg_tokens_per_second,
                "runs": prompt_results
            })

            print("\n📊 Average for prompt:")
            print(f"⏱️ Time: {avg_duration:.2f}s")
            print(f"📊 Tokens/sec: {avg_tokens_per_second:.2f}")

    # Save all results to a timestamped JSON file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"benchmark_results_{model_name}_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"\n💾 Results saved to {results_file}")


if __name__ == "__main__":
    # Get total system memory in GB (informational; not otherwise used below)
    total_memory = psutil.virtual_memory().total / (1024 ** 3)

    # Use custom model with optimized parameters
    model_name = "deepseek-r1:8b-custom"
    print(f"\n🔄 Benchmarking {model_name}...")

    # Perform warm-up runs first
    warm_up_model(model_name)

    # Run the actual benchmarks
    benchmark_model(model_name)
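
# Usage sketch (assumptions: a local Ollama installation, and that the
# "deepseek-r1:8b-custom" tag used above has already been created with
# `ollama create`; the alternative tag below is purely illustrative):
#
#   python llm_benchmark.py
#
# The benchmark helpers can also be imported and pointed at another locally
# available model without editing the main block, e.g.:
#
#   from llm_benchmark import warm_up_model, benchmark_model
#   warm_up_model("llama3.2:3b")               # hypothetical model tag
#   benchmark_model("llama3.2:3b", num_runs=3)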