
Commit 9b70e2b

[Misc][Tools][Benchmark] Publish script to auto tune server parameters (#17207)
Signed-off-by: Chenyaaang <[email protected]>
1 parent 173daac commit 9b70e2b

File tree

1 file changed (+212, -0)

benchmarks/auto_tune.sh

#!/bin/bash

# This script tunes server parameter combinations to maximize throughput for a given workload.
# The parameters currently tuned are max_num_seqs and max_num_batched_tokens.
# It also supports additional constraints: e2e latency and prefix cache hit rate.

# Prerequisites:
# 1. Check out your branch and install/update the runtime environment. For TPU, activate the conda env and install the matching torch and torch_xla versions.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set the variables below (ALL REQUIRED):
#    BASE: your directory for the vllm repo
#    MODEL: the model served by vllm
#    DOWNLOAD_DIR: directory to download and load model weights
#    INPUT_LEN: request input length
#    OUTPUT_LEN: request output length
#    MIN_CACHE_HIT_PCT: minimum prefix cache hit rate (percent)
#    MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement; if there is no latency requirement, set it to a large number like 1000000000
# 4. Run the script. It can take a long time; run it inside tmux so it survives a disconnection.
# 5. The final results are saved to the RESULT file.

# Example use cases:
# 1. Given input_len=1800, output_len=20, what are the best max_num_seqs and max_num_batched_tokens for the highest throughput?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If e2e latency must stay below 500 ms, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach a 60% prefix cache hit rate, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500
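# As a concrete setup for use case 2 above (values are illustrative, not a
# recommendation; BASE and DOWNLOAD_DIR depend on your machine), the required
# variables could be set like this before running the script:
#    BASE="$HOME"
#    MODEL="meta-llama/Llama-3.1-8B-Instruct"
#    DOWNLOAD_DIR="$HOME/models"
#    INPUT_LEN=1800
#    OUTPUT_LEN=20
#    MIN_CACHE_HIT_PCT=0
#    MAX_LATENCY_ALLOWED_MS=500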
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"

echo "result file: $RESULT"
echo "model: $MODEL"
echo

rm -rf "$LOG_FOLDER"
mkdir -p "$LOG_FOLDER"

cd "$BASE/vllm"
# create sonnet_4x.txt (sonnet.txt concatenated 4 times) so we can sample enough tokens for long inputs
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
    cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done
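# Optional sanity check (illustrative): the concatenated file should be about
# 4x the size of the original.
#    wc -l benchmarks/sonnet.txt benchmarks/sonnet_4x.txt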
pip install datasets

current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
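# run_benchmark starts a vllm server with the given (max_num_seqs,
# max_num_batched_tokens) pair, benchmarks it, and updates the best
# combination seen so far.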
run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f "$vllm_log"

    # start the server in the background; all output goes to the vllm log
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization 0.98 \
        --max-num-seqs $max_num_seqs \
        --max-num-batched-tokens $max_num_batched_tokens \
        --tensor-parallel-size 1 \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir "$DOWNLOAD_DIR" \
        --max-model-len $(( INPUT_LEN + OUTPUT_LEN )) > "$vllm_log" 2>&1 &
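    # Note: --load-format dummy initializes weights randomly instead of loading
    # real checkpoints (intended for profiling), so generated text is
    # meaningless; with fixed input/output lengths and --ignore-eos the timing
    # should still be representative.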
echo "wait for 10 minutes.."
89+
echo
90+
# wait for 10 minutes...
91+
server_started=0
92+
for i in {1..60}; do
93+
if grep -Fq "Application startup complete" "$vllm_log"; then
94+
echo "Application started"
95+
server_started=1
96+
break
97+
else
98+
# echo "wait for 10 seconds..."
99+
sleep 10
100+
fi
101+
done
102+
103+
if (( ! server_started )); then
104+
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
105+
echo "pkill -f vllm"
106+
echo
107+
pkill vllm
108+
sleep 10
109+
return 1
110+
fi
111+
112+
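    # Strategy: first measure throughput with an unbounded request rate. If the
    # resulting P99 e2e latency already meets the limit, we are done; otherwise,
    # retry at decreasing fixed request rates until the requirement is met.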
echo "run benchmark test..."
113+
echo
114+
meet_latency_requirement=0
115+
# get a basic qps by using request-rate inf
116+
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
117+
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
118+
python benchmarks/benchmark_serving.py \
119+
--backend vllm \
120+
--model $MODEL \
121+
--dataset-name sonnet \
122+
--dataset-path benchmarks/sonnet_4x.txt \
123+
--sonnet-input-len $INPUT_LEN \
124+
--sonnet-output-len $OUTPUT_LEN \
125+
--ignore-eos \
126+
--disable-tqdm \
127+
--request-rate inf \
128+
--percentile-metrics ttft,tpot,itl,e2el \
129+
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
130+
--num-prompts 100 \
131+
--sonnet-prefix-len $prefix_len \
132+
--port 8004 > "$bm_log"
133+
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
134+
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
135+
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
136+
137+
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
138+
meet_latency_requirement=1
139+
fi
140+
141+
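    # The unbounded run overloads the server, so its P99 latency is usually
    # above the limit; int(through_put) + 1 req/s is an upper bound on the
    # sustainable rate, so we search downward from there one req/s at a time.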
    if (( ! meet_latency_requirement )); then
        # start from request-rate int(through_put) + 1 and decrease until the latency requirement is met
        request_rate=$((${through_put%.*} + 1))
        while ((request_rate > 0)); do
            # clear the prefix cache between runs
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL \
                --dataset-name sonnet \
                --dataset-path benchmarks/sonnet_4x.txt \
                --sonnet-input-len $INPUT_LEN \
                --sonnet-output-len $OUTPUT_LEN \
                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --sonnet-prefix-len $prefix_len \
                --port 8004 > "$bm_log"
            through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$((request_rate-1))
        done
    fi
    # write the results and update the best result
    if ((meet_latency_requirement)); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $through_put, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $through_put, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
            best_throughput=$through_put
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    # shut the server down before the next combination
    echo "pkill -f vllm"
    echo
    pkill -f vllm
    sleep 10
    rm -f "$vllm_log"
    printf '=%.0s' $(seq 1 20)
    echo
    return 0
}

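# Sweep the 2x4 grid of candidate values. Each run_benchmark call starts a
# fresh server with one combination, benchmarks it, and shuts it down.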
num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
for num_seqs in $num_seqs_list; do
    for num_batched_tokens in $num_batched_tokens_list; do
        run_benchmark $num_seqs $num_batched_tokens
    done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
