#!/bin/bash

# This script tunes the server parameter combination that maximizes throughput for a given requirement.
# The parameters currently tuned are max_num_seqs and max_num_batched_tokens.
# It also supports additional requirements: e2e latency and prefix cache hit rate.

# Pre-requisites:
# 1. Check out your branch and install/update the correct running environment. For TPU, activate the conda env and install the matching torch and torch_xla versions.
# 2. If the model is customized, replace MODEL's config with the customized config.
# 3. Set the variables (ALL REQUIRED):
#    BASE: your directory for the vllm repo
#    MODEL: the model served by vllm
#    DOWNLOAD_DIR: directory to download and load model weights
#    INPUT_LEN: request input length
#    OUTPUT_LEN: request output length
#    MIN_CACHE_HIT_PCT: target prefix cache hit percentage
#    MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there is no latency requirement, set it to a very large number, e.g. 1000000000
# 4. Run the script. It may take a long time; run it inside tmux so it survives a disconnection.
# 5. The final result will be saved in the RESULT file.


# Example use cases:
# 1. Given input_len=1800 and output_len=20, what are the best max_num_seqs and max_num_batched_tokens for the highest throughput?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If e2e latency must stay below 500ms, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach a 60% prefix cache hit rate, what are the best server parameters?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500

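# For instance, a hypothetical configuration for use case 2 (paths are placeholders):
#   BASE="/home/<user>"
#   MODEL="meta-llama/Llama-3.1-8B-Instruct"
#   DOWNLOAD_DIR="/mnt/disks/persist"
#   INPUT_LEN=1800
#   OUTPUT_LEN=20
#   MIN_CACHE_HIT_PCT=0
#   MAX_LATENCY_ALLOWED_MS=500
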
TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"

echo "result file: $RESULT"
echo "model: $MODEL"
echo

rm -rf "$LOG_FOLDER"
mkdir -p "$LOG_FOLDER"

cd "$BASE/vllm"
# create sonnet_4x.txt (4 concatenated copies of sonnet.txt) so there is enough text to sample long inputs from
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
    cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

pip install datasets

current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    echo "vllm_log: $vllm_log"
    echo
    rm -f "$vllm_log"

    # start the server
    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization 0.98 \
        --max-num-seqs $max_num_seqs \
        --max-num-batched-tokens $max_num_batched_tokens \
        --tensor-parallel-size 1 \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir "$DOWNLOAD_DIR" \
        --max-model-len $(( INPUT_LEN + OUTPUT_LEN )) > "$vllm_log" 2>&1 &
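    # note: --load-format dummy initializes random weights instead of downloading
    # real ones; throughput depends on tensor shapes rather than weight values, so
    # this keeps server startup fast without skewing the tuning results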
    echo "waiting for the server to start (up to 10 minutes).."
    echo
    # poll the server log for up to 10 minutes (60 checks x 10s)
    server_started=0
    for i in {1..60}; do
        if grep -Fq "Application startup complete" "$vllm_log"; then
            echo "Application started"
            server_started=1
            break
        else
            sleep 10
        fi
    done

    if (( ! server_started )); then
        echo "Server did not start within 10 minutes; terminating this benchmark. Please check the server log at $vllm_log"
        echo "pkill vllm"
        echo
        pkill vllm
        sleep 10
        return 1
    fi

    echo "run benchmark tests..."
    echo
    meet_latency_requirement=0
    # get a baseline throughput by running with request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
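    # e.g. with INPUT_LEN=4000 and MIN_CACHE_HIT_PCT=60, prefix_len=2400: every
    # request shares a 2400-token prefix, so ~60% of input tokens can hit the cache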
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL \
        --dataset-name sonnet \
        --dataset-path benchmarks/sonnet_4x.txt \
        --sonnet-input-len $INPUT_LEN \
        --sonnet-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 100 \
        --sonnet-prefix-len $prefix_len \
        --port 8004 > "$bm_log"
    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
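    # the benchmark summary contains lines like "Request throughput (req/s): 12.34"
    # (value hypothetical); sed strips everything except the digits and the dot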

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
    fi

    if (( ! meet_latency_requirement )); then
        # the unthrottled run was too slow; probe decreasing request rates,
        # starting from int(throughput) + 1
        request_rate=$(( ${throughput%.*} + 1 ))
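        # e.g. a measured throughput of 12.3 req/s (hypothetical) yields
        # ${throughput%.*}=12, so probing starts at request_rate=13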
        while (( request_rate > 0 )); do
            # clear the prefix cache between runs
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
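            # (/reset_prefix_cache is a development endpoint; it is available here
            # because the server was started with VLLM_SERVER_DEV_MODE=1)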
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL \
                --dataset-name sonnet \
                --dataset-path benchmarks/sonnet_4x.txt \
                --sonnet-input-len $INPUT_LEN \
                --sonnet-output-len $OUTPUT_LEN \
                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --sonnet-prefix-len $prefix_len \
                --port 8004 > "$bm_log"
            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$(( request_rate - 1 ))
        done
    fi
    # record the result and update the best combination found so far
    if (( meet_latency_requirement )); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
            best_throughput=$throughput
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi


    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    echo "pkill vllm"
    echo
    pkill vllm
    sleep 10
    rm -f "$vllm_log"
    printf '=%.0s' $(seq 1 20)
    echo
    return 0
}


num_seqs_list="128 256"
num_batched_tokens_list="512 1024 2048 4096"
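# These two lists define the search grid. To explore more combinations, widen
# them, e.g. num_seqs_list="64 128 256 512" (each added value costs one more
# full server start plus benchmark sweep)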
for num_seqs in $num_seqs_list; do
    for num_batched_tokens in $num_batched_tokens_list; do
        run_benchmark $num_seqs $num_batched_tokens
    done
done
echo "finished all permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"