
Update TensorRT-LLM backend #324

Merged: 3 commits, Jan 31, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -156,7 +156,6 @@ python3 build.py --model_dir=./c-model/gpt2/4-gpu/ \
--paged_kv_cache \
--use_gemm_plugin float16 \
--remove_input_padding \
--use_layernorm_plugin float16 \
--hidden_act gelu \
--parallel_build \
--output_dir=engines/fp16/4-gpu
@@ -222,6 +221,7 @@ The following table shows the fields that may need to be modified before deployment:
| `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
| `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` |
| `enable_chunked_context` | Optional (default=`false`). Set to `true` to enable context chunking. |
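
As a quick illustration (not part of this diff), the new field would typically be filled in like the other placeholders in `triton_model_repo/tensorrt_llm/config.pbtxt`. The sketch below assumes the repo's `tools/fill_template.py` helper and a model repository copied to `triton_model_repo/`; per the backend change further down, context chunking also requires an engine built with `use_paged_context_fmha`.

```bash
# Hedged sketch, not part of the change: substitute the new placeholder.
# Paths and the fill_template.py usage are assumptions based on the repo layout.
python3 tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt \
    enable_chunked_context:true
```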

*triton_model_repo/postprocessing/config.pbtxt*

2 changes: 2 additions & 0 deletions all_models/gpt/tensorrt_llm/1/model.py
@@ -84,6 +84,7 @@ def initialize(self, args):
hidden_size = config['builder_config']['hidden_size'] // world_size
vocab_size = config['builder_config']['vocab_size']
num_layers = config['builder_config']['num_layers']
max_batch_size = config['builder_config']['max_batch_size']
num_kv_heads = num_heads
if "num_kv_heads" in config['builder_config'].keys():
num_kv_heads = (config['builder_config']['num_kv_heads'] +
@@ -96,6 +97,7 @@ def initialize(self, args):
self.rank = mpi_rank()

model_config = ModelConfig(
max_batch_size=max_batch_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
hidden_size=hidden_size,
12 changes: 12 additions & 0 deletions all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
@@ -356,3 +356,15 @@ parameters: {
string_value: "${normalize_log_probs}"
}
}
parameters: {
key: "enable_chunked_context"
value: {
string_value: "${enable_chunked_context}"
}
}
parameters: {
key: "gpu_device_ids"
value: {
string_value: "${gpu_device_ids}"
}
}
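
For illustration only (not part of the diff), these placeholders could be substituted with the repo's `tools/fill_template.py` helper. The backend parses `gpu_device_ids` as a comma-separated list of integer device ids and selects devices automatically when the parameter is left unset (see `model_state.cc` below); the concrete values shown are assumptions.

```bash
# Hedged sketch: fill each new placeholder. "0" pins the instance to GPU 0,
# while a multi-GPU deployment would pass a comma-separated list such as 0,1.
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt \
    enable_chunked_context:false
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt \
    gpu_device_ids:0
```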
1 change: 0 additions & 1 deletion ci/L0_backend_trtllm/generate_engines.sh
@@ -53,7 +53,6 @@ function build_tensorrt_engine_inflight_batcher {
--paged_kv_cache \
--use_gemm_plugin float16 \
--remove_input_padding \
--use_layernorm_plugin float16 \
--hidden_act gelu \
--parallel_build \
--output_dir="${OUTPUT_DIR}"
64 changes: 17 additions & 47 deletions inflight_batcher_llm/CMakeLists.txt
@@ -208,48 +208,11 @@ target_compile_options(
/D_WIN32_WINNT=0x0A00
/EHsc>)

add_library(tensorrt_llm STATIC IMPORTED)
add_library(tensorrt_llm SHARED IMPORTED)
set_property(
TARGET tensorrt_llm
PROPERTY IMPORTED_LOCATION
"${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm_static.a")

add_library(tensorrt_llm_batch_manager STATIC IMPORTED)
execute_process(
COMMAND ${Python3_EXECUTABLE} "-c"
"import torch; print(torch.compiled_with_cxx11_abi(),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)
message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}")

set(BATCH_MANAGER_ARCH "unknown")

message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
set(BATCH_MANAGER_ARCH "x86_64-linux-gnu")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(BATCH_MANAGER_ARCH "aarch64-linux-gnu")
else()
message(
FATAL_ERROR
"The system processor type is unsupported: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

if(USE_CXX11_ABI)
set_property(
TARGET tensorrt_llm_batch_manager
PROPERTY
IMPORTED_LOCATION
"${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/${BATCH_MANAGER_ARCH}/libtensorrt_llm_batch_manager_static.a"
)
else()
set_property(
TARGET tensorrt_llm_batch_manager
PROPERTY
IMPORTED_LOCATION
"${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/${BATCH_MANAGER_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a"
)
endif()
"${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so")

add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
set_property(
@@ -289,7 +252,7 @@ if(TRITON_ENABLE_METRICS)
triton-core-serverapi # from repo-core
triton-core-serverstub # from repo-core
triton-backend-utils # from repo-backend
tensorrt_llm_batch_manager)
tensorrt_llm)

target_compile_definitions(triton-tensorrt-llm-backend
PRIVATE TRITON_ENABLE_METRICS=1)
@@ -320,8 +283,7 @@ endif() # TRITON_BUILD

target_link_libraries(
triton-tensorrt-llm-backend
PRIVATE tensorrt_llm_batch_manager
tensorrt_llm
PRIVATE tensorrt_llm
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
triton-core-serverstub # from repo-core
@@ -348,11 +310,13 @@ if(WIN32)
else()
set_target_properties(
triton-tensorrt-llm-backend
PROPERTIES POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm
LINK_DEPENDS
${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript
LINK_FLAGS "-Wl,--version-script libtriton_tensorrtllm.ldscript")
PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript
LINK_FLAGS
"-Wl,--version-script libtriton_tensorrtllm.ldscript -Wl,-rpath,'$ORIGIN' -Wl,--no-undefined"
)
endif()

#
@@ -375,6 +339,12 @@ if(TRITON_BUILD)
FOLLOW_SYMLINKS)
install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM}
DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)

file(GLOB LIBINFER_PLUGIN_TENSORRT_LLM
"${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so*"
FOLLOW_SYMLINKS)
install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM}
DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)
endif() # TRITON_BUILD

install(
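Because the backend now links TensorRT-LLM as a shared library and locates it through the `$ORIGIN` rpath added above, a minimal sanity check of an installed build might look like the sketch below. The `/opt/tritonserver` prefix stands in for `CMAKE_INSTALL_PREFIX` and is an assumption, not something this change mandates.

```bash
# Hedged sketch: confirm libtensorrt_llm.so is resolved next to the backend
# via the $ORIGIN rpath. The install prefix is an assumed default.
BACKEND_DIR=/opt/tritonserver/backends/tensorrtllm
readelf -d "${BACKEND_DIR}/libtriton_tensorrtllm.so" | grep -E 'RPATH|RUNPATH'
ldd "${BACKEND_DIR}/libtriton_tensorrtllm.so" | grep tensorrt_llm
```
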
1 change: 0 additions & 1 deletion inflight_batcher_llm/README.md
@@ -39,7 +39,6 @@ python3 build.py --model_dir=${model_directory} \
--paged_kv_cache \
--use_gemm_plugin float16 \
--remove_input_padding \
--use_layernorm_plugin float16 \
--hidden_act gelu \
--output_dir=engines/fp16/1-gpu
```
54 changes: 50 additions & 4 deletions inflight_batcher_llm/src/model_instance_state.cc
@@ -148,12 +148,35 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
TLLM_LOG_WARNING(e.what());
}

bool enableChunkedContext = false;
try
{
enableChunkedContext = model_state_->GetParameter<bool>("enable_chunked_context");
if (enableChunkedContext)
{
TLLM_LOG_WARNING(
"enable_chunked_context is set to true, will use context chunking "
"(requires building the model with use_paged_context_fmha).");
}
}
catch (const std::exception& e)
{
// If parameter is not specified, just ignore
TLLM_LOG_WARNING("enable_chunked_context is not specified, will be set to false.");
}

if (mIsDecoupled && schedulerPolicy != SchedulerPolicy::GUARANTEED_NO_EVICT)
{
TLLM_LOG_WARNING(
"The batch scheduler policy will be set to guaranteed_no_evict"
"since the backend operates in decoupled mode");
schedulerPolicy = SchedulerPolicy::GUARANTEED_NO_EVICT;
if (!enableChunkedContext)
{
TLLM_LOG_WARNING(
"Decoupled mode with a batch scheduler policy other than guaranteed_no_evict "
"requires building the model with use_paged_context_fmha and setting "
"enable_chunked_context to true. "
"The batch scheduler policy will be set to guaranteed_no_evict "
"since enable_chunked_context is false.");
schedulerPolicy = SchedulerPolicy::GUARANTEED_NO_EVICT;
}
}

std::optional<float> kvCacheFreeGpuMemFraction = std::nullopt;
@@ -226,13 +249,36 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
TLLM_LOG_WARNING("enable_kv_cache_reuse is not specified, will be set to false");
}

std::optional<std::vector<int32_t>> gpuDeviceIds;
try
{
gpuDeviceIds = model_state_->GetParameter<std::vector<int32_t>>("gpu_device_ids");

if (gpuDeviceIds)
{
std::string deviceIdInfo("Using GPU device ids: ");
for (auto const& deviceId : gpuDeviceIds.value())
{
deviceIdInfo += std::to_string(deviceId) + " ";
}
TLLM_LOG_INFO(deviceIdInfo);
}
}
catch (const std::exception& e)
{
// If parameter is not specified, just ignore
TLLM_LOG_WARNING("gpu_device_ids is not specified, will be automatically set");
}

TrtGptModelOptionalParams optionalParams;
optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache;
optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
optionalParams.kvCacheConfig.maxAttentionWindow = maxAttentionWindow;
optionalParams.kvCacheConfig.enableBlockReuse = enableKVCacheReuse;
optionalParams.enableTrtOverlap = enableTrtOverlap;
optionalParams.normalizeLogProbs = normalizeLogProbs;
optionalParams.enableChunkedContext = enableChunkedContext;
optionalParams.deviceIds = gpuDeviceIds;

mBatchManager = std::make_shared<GptManager>(
mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,
22 changes: 22 additions & 0 deletions inflight_batcher_llm/src/model_state.cc
@@ -29,6 +29,20 @@
namespace triton::backend::inflight_batcher_llm
{

/// Helper function to parse a csv delimited string to a vector of ints
std::vector<int32_t> csvStrToVecInt(std::string const& str)
{
std::vector<int32_t> output;
std::stringstream ss(str);
while (ss.good())
{
std::string substr;
getline(ss, substr, ',');
output.push_back(std::stoi(substr));
}
return output;
}

TRITONSERVER_Error* ModelState::Create(
TRITONBACKEND_Model* triton_model, const std::string& name, const uint64_t version, ModelState** state)
{
@@ -107,6 +121,14 @@ int32_t ModelState::GetParameter<int32_t>(const std::string& name)
return std::stoi(GetParameter<std::string>(name));
}

template <>
std::vector<int32_t> ModelState::GetParameter<std::vector<int32_t>>(const std::string& name)
{
auto deviceIdsStr = GetParameter<std::string>(name);
// Parse as comma delimited string
return csvStrToVecInt(deviceIdsStr);
}

template <>
uint32_t ModelState::GetParameter<uint32_t>(const std::string& name)
{
3 changes: 3 additions & 0 deletions inflight_batcher_llm/src/model_state.h
@@ -104,4 +104,7 @@ float ModelState::GetParameter<float>(const std::string& name);
template <>
bool ModelState::GetParameter<bool>(const std::string& name);

template <>
std::vector<int32_t> ModelState::GetParameter<std::vector<int32_t>>(const std::string& name);

} // namespace triton::backend::inflight_batcher_llm
5 changes: 2 additions & 3 deletions scripts/benchmarking/build_model.sh
@@ -14,7 +14,7 @@ GPT2=/trt_llm_data/llm-models/gpt2
OPT_125M=/trt_llm_data/llm-models/opt-125m
LLAMA=/trt_llm_data/llm-models/llama-models/llama-7b-hf
GPTJ=/trt_llm_data/llm-models/gpt-j-6b
MISTRAL=/trt_llm_data/llm-models/Mistral-7B-v0.1
MISTRAL=/trt_llm_data/llm-models/mistral-7b-v0.1

set -e
pushd ../../
@@ -59,7 +59,6 @@ if [ "$MODEL" = "mistral-7b-fp16" ]; then
--use_gemm_plugin float16 \
--output_dir "$ENGINE_PATH" \
--max_batch_size "$BS" --max_input_len 32256 --max_output_len 512 \
--use_rmsnorm_plugin float16 \
--enable_context_fmha --remove_input_padding \
--use_inflight_batching --paged_kv_cache \
--max_num_tokens "$MAX_TOKENS"
@@ -226,7 +225,7 @@ if [ "$MODEL" = "llama-70b-fp16" ]; then
--world_size "$WORLD_SIZE" \
--tp_size "$TP" \
--pp_size "$PP" \
--n_layer 80 --n_head 64 -n_kv_head 8 --n_embd 8192 --inter_size 28672 \
--n_layer 80 --n_head 64 --n_kv_head 8 --n_embd 8192 --inter_size 28672 \
--vocab_size 32000 --n_positions 4096 --hidden_act "silu" \
--ffn_dim_multiplier 1.3 --multiple_of 4096 \
--use_gemm_plugin float16
2 changes: 1 addition & 1 deletion tensorrt_llm
Submodule tensorrt_llm updated 759 files
2 changes: 2 additions & 0 deletions tools/inflight_batcher_llm/benchmark_core_model.py
@@ -52,12 +52,14 @@ def test_performance(client, input_start_ids, input_lens, output_lens, delays,
user_data = utils.UserData()
for i, ids in enumerate(input_start_ids):
output0_len = np.ones_like([[1]]).astype(np.int32) * output_lens[i]
end_id = np.ones_like([[1]]).astype(np.int32) * -1
inputs = [
utils.prepare_tensor("input_ids", ids, FLAGS.protocol),
utils.prepare_tensor("input_lengths", input_lens[i],
FLAGS.protocol),
utils.prepare_tensor("request_output_len", output0_len,
FLAGS.protocol),
utils.prepare_tensor("end_id", end_id, FLAGS.protocol),
]

time.sleep(delays[i])
11 changes: 9 additions & 2 deletions tools/inflight_batcher_llm/end_to_end_test.py
@@ -70,6 +70,9 @@ def test_functionality(client, prompts, output_lens):
generation_logits = result.as_numpy("generation_logits").astype(
np.float32)

print(f"context_logits.shape: {context_logits.shape}")
print(f"generation_logits.shape: {generation_logits.shape}")

model_name = "postprocessing"
inputs = [
utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol),
@@ -116,11 +119,15 @@ def test_functionality(client, prompts, output_lens):
ensemble_output = result.as_numpy('text_output')
ensemble_cum_log_probs = result.as_numpy('cum_log_probs')
ensemble_output_log_probs = result.as_numpy('output_log_probs')
result.as_numpy('context_logits')
result.as_numpy('generation_logits')
ensemble_context_logits = result.as_numpy('context_logits')
ensemble_generation_logits = result.as_numpy('generation_logits')

assert output0 == ensemble_output
assert cum_log_probs == ensemble_cum_log_probs
assert (output_log_probs == ensemble_output_log_probs).all()
assert (context_logits == ensemble_context_logits).all()
assert (generation_logits == ensemble_generation_logits).all()

if FLAGS.verbose:
print('Response: {}'.format(result.get_response()))
print('Output: {}'.format(ensemble_output))
2 changes: 1 addition & 1 deletion tools/version.txt
@@ -1 +1 @@
dcc9252db07dc9e8d4584fd92dbbf743a6c18b4e
63ca8816513a8afe8fdda1235cd8228278c5a785