diff --git a/README.md b/README.md index 74154082..321a3e07 100644 --- a/README.md +++ b/README.md @@ -156,7 +156,6 @@ python3 build.py --model_dir=./c-model/gpt2/4-gpu/ \ --paged_kv_cache \ --use_gemm_plugin float16 \ --remove_input_padding \ - --use_layernorm_plugin float16 \ --hidden_act gelu \ --parallel_build \ --output_dir=engines/fp16/4-gpu @@ -222,6 +221,7 @@ The following table shows the fields that may to be modified before deployment: | `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime | | `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens | | `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` | +| `enable_chunked_context` | Optional (default=`false`). Set to `true` to enable context chunking. | *triton_model_repo/postprocessing/config.pbtxt* diff --git a/all_models/gpt/tensorrt_llm/1/model.py b/all_models/gpt/tensorrt_llm/1/model.py index 90fa38ca..7b42f441 100644 --- a/all_models/gpt/tensorrt_llm/1/model.py +++ b/all_models/gpt/tensorrt_llm/1/model.py @@ -84,6 +84,7 @@ def initialize(self, args): hidden_size = config['builder_config']['hidden_size'] // world_size vocab_size = config['builder_config']['vocab_size'] num_layers = config['builder_config']['num_layers'] + max_batch_size = config['builder_config']['max_batch_size'] num_kv_heads = num_heads if "num_kv_heads" in config['builder_config'].keys(): num_kv_heads = (config['builder_config']['num_kv_heads'] + @@ -96,6 +97,7 @@ def initialize(self, args): self.rank = mpi_rank() model_config = ModelConfig( + max_batch_size=max_batch_size, num_heads=num_heads, num_kv_heads=num_kv_heads, hidden_size=hidden_size, diff --git a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt index efd15018..4acfabc6 100644 --- a/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ b/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt @@ -356,3 +356,15 @@ parameters: { string_value: "${normalize_log_probs}" } } +parameters: { + key: "enable_chunked_context" + value: { + string_value: "${enable_chunked_context}" + } +} +parameters: { + key: "gpu_device_ids" + value: { + string_value: "${gpu_device_ids}" + } +} diff --git a/ci/L0_backend_trtllm/generate_engines.sh b/ci/L0_backend_trtllm/generate_engines.sh index 869bbf7c..baf31965 100644 --- a/ci/L0_backend_trtllm/generate_engines.sh +++ b/ci/L0_backend_trtllm/generate_engines.sh @@ -53,7 +53,6 @@ function build_tensorrt_engine_inflight_batcher { --paged_kv_cache \ --use_gemm_plugin float16 \ --remove_input_padding \ - --use_layernorm_plugin float16 \ --hidden_act gelu \ --parallel_build \ --output_dir="${OUTPUT_DIR}" diff --git a/inflight_batcher_llm/CMakeLists.txt b/inflight_batcher_llm/CMakeLists.txt index 25564502..49a8975d 100644 --- a/inflight_batcher_llm/CMakeLists.txt +++ b/inflight_batcher_llm/CMakeLists.txt @@ -208,48 +208,11 @@ target_compile_options( /D_WIN32_WINNT=0x0A00 /EHsc>) -add_library(tensorrt_llm STATIC IMPORTED) +add_library(tensorrt_llm SHARED IMPORTED) set_property( TARGET tensorrt_llm PROPERTY IMPORTED_LOCATION - "${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm_static.a") - -add_library(tensorrt_llm_batch_manager STATIC IMPORTED) -execute_process( - COMMAND 
${Python3_EXECUTABLE} "-c" - "import torch; print(torch.compiled_with_cxx11_abi(),end='');" - RESULT_VARIABLE _PYTHON_SUCCESS - OUTPUT_VARIABLE USE_CXX11_ABI) -message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") - -set(BATCH_MANAGER_ARCH "unknown") - -message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}") -if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") - set(BATCH_MANAGER_ARCH "x86_64-linux-gnu") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") - set(BATCH_MANAGER_ARCH "aarch64-linux-gnu") -else() - message( - FATAL_ERROR - "The system processor type is unsupported: ${CMAKE_SYSTEM_PROCESSOR}") -endif() - -if(USE_CXX11_ABI) - set_property( - TARGET tensorrt_llm_batch_manager - PROPERTY - IMPORTED_LOCATION - "${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/${BATCH_MANAGER_ARCH}/libtensorrt_llm_batch_manager_static.a" - ) -else() - set_property( - TARGET tensorrt_llm_batch_manager - PROPERTY - IMPORTED_LOCATION - "${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/${BATCH_MANAGER_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a" - ) -endif() + "${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so") add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED) set_property( @@ -289,7 +252,7 @@ if(TRITON_ENABLE_METRICS) triton-core-serverapi # from repo-core triton-core-serverstub # from repo-core triton-backend-utils # from repo-backend - tensorrt_llm_batch_manager) + tensorrt_llm) target_compile_definitions(triton-tensorrt-llm-backend PRIVATE TRITON_ENABLE_METRICS=1) @@ -320,8 +283,7 @@ endif() # TRITON_BUILD target_link_libraries( triton-tensorrt-llm-backend - PRIVATE tensorrt_llm_batch_manager - tensorrt_llm + PRIVATE tensorrt_llm triton-core-serverapi # from repo-core triton-core-backendapi # from repo-core triton-core-serverstub # from repo-core @@ -348,11 +310,13 @@ if(WIN32) else() set_target_properties( triton-tensorrt-llm-backend - PROPERTIES POSITION_INDEPENDENT_CODE ON - OUTPUT_NAME triton_tensorrtllm - LINK_DEPENDS - ${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript - LINK_FLAGS "-Wl,--version-script libtriton_tensorrtllm.ldscript") + PROPERTIES + POSITION_INDEPENDENT_CODE ON + OUTPUT_NAME triton_tensorrtllm + LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript + LINK_FLAGS + "-Wl,--version-script libtriton_tensorrtllm.ldscript -Wl,-rpath,'$ORIGIN' -Wl,--no-undefined" + ) endif() # @@ -375,6 +339,12 @@ if(TRITON_BUILD) FOLLOW_SYMLINKS) install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM} DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm) + + file(GLOB LIBINFER_PLUGIN_TENSORRT_LLM + "${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so*" + FOLLOW_SYMLINKS) + install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM} + DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm) endif() # TRITON_BUILD install( diff --git a/inflight_batcher_llm/README.md b/inflight_batcher_llm/README.md index 0c418093..fe096cb8 100644 --- a/inflight_batcher_llm/README.md +++ b/inflight_batcher_llm/README.md @@ -39,7 +39,6 @@ python3 build.py --model_dir=${model_directory} \ --paged_kv_cache \ --use_gemm_plugin float16 \ --remove_input_padding \ - --use_layernorm_plugin float16 \ --hidden_act gelu \ --output_dir=engines/fp16/1-gpu ``` diff --git a/inflight_batcher_llm/src/model_instance_state.cc b/inflight_batcher_llm/src/model_instance_state.cc index 0f2ab084..d0659540 100644 --- a/inflight_batcher_llm/src/model_instance_state.cc +++ b/inflight_batcher_llm/src/model_instance_state.cc @@ -148,12 +148,35 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, 
TRITONBACKEND_Mo
         TLLM_LOG_WARNING(e.what());
     }
 
+    bool enableChunkedContext = false;
+    try
+    {
+        enableChunkedContext = model_state_->GetParameter<bool>("enable_chunked_context");
+        if (enableChunkedContext)
+        {
+            TLLM_LOG_WARNING(
+                "enable_chunked_context is set to true, will use context chunking "
+                "(requires building the model with use_paged_context_fmha).");
+        }
+    }
+    catch (const std::exception& e)
+    {
+        // If parameter is not specified, just ignore
+        TLLM_LOG_WARNING("enable_chunked_context is not specified, will be set to false.");
+    }
+
     if (mIsDecoupled && schedulerPolicy != SchedulerPolicy::GUARANTEED_NO_EVICT)
     {
-        TLLM_LOG_WARNING(
-            "The batch scheduler policy will be set to guaranteed_no_evict"
-            "since the backend operates in decoupled mode");
-        schedulerPolicy = SchedulerPolicy::GUARANTEED_NO_EVICT;
+        if (!enableChunkedContext)
+        {
+            TLLM_LOG_WARNING(
+                "Decoupled mode with a batch scheduler policy other than guaranteed_no_evict "
+                "requires building the model with use_paged_context_fmha and setting "
+                "enable_chunked_context to true. "
+                "The batch scheduler policy will be set to guaranteed_no_evict "
+                "since enable_chunked_context is false.");
+            schedulerPolicy = SchedulerPolicy::GUARANTEED_NO_EVICT;
+        }
     }
 
     std::optional<float> kvCacheFreeGpuMemFraction = std::nullopt;
@@ -226,6 +249,27 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
         TLLM_LOG_WARNING("enable_kv_cache_reuse is not specified, will be set to false");
     }
 
+    std::optional<std::vector<int32_t>> gpuDeviceIds;
+    try
+    {
+        gpuDeviceIds = model_state_->GetParameter<std::vector<int32_t>>("gpu_device_ids");
+
+        if (gpuDeviceIds)
+        {
+            std::string deviceIdInfo("Using GPU device ids: ");
+            for (auto const& deviceId : gpuDeviceIds.value())
+            {
+                deviceIdInfo += std::to_string(deviceId) + " ";
+            }
+            TLLM_LOG_INFO(deviceIdInfo);
+        }
+    }
+    catch (const std::exception& e)
+    {
+        // If parameter is not specified, just ignore
+        TLLM_LOG_WARNING("gpu_device_ids is not specified, will be automatically set");
+    }
+
     TrtGptModelOptionalParams optionalParams;
     optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache;
     optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
@@ -233,6 +277,8 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
     optionalParams.kvCacheConfig.enableBlockReuse = enableKVCacheReuse;
     optionalParams.enableTrtOverlap = enableTrtOverlap;
     optionalParams.normalizeLogProbs = normalizeLogProbs;
+    optionalParams.enableChunkedContext = enableChunkedContext;
+    optionalParams.deviceIds = gpuDeviceIds;
 
     mBatchManager = std::make_shared<GptManager>(
         mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,
diff --git a/inflight_batcher_llm/src/model_state.cc b/inflight_batcher_llm/src/model_state.cc
index 2eadbde1..d03fba10 100644
--- a/inflight_batcher_llm/src/model_state.cc
+++ b/inflight_batcher_llm/src/model_state.cc
@@ -29,6 +29,20 @@ namespace triton::backend::inflight_batcher_llm
 {
 
+/// Helper function to parse a csv delimited string to a vector ints
+std::vector<int32_t> csvStrToVecInt(std::string const& str)
+{
+    std::vector<int32_t> output;
+    std::stringstream ss(str);
+    while (ss.good())
+    {
+        std::string substr;
+        getline(ss, substr, ',');
+        output.push_back(std::stoi(substr));
+    }
+    return output;
+}
+
 TRITONSERVER_Error* ModelState::Create(
     TRITONBACKEND_Model* triton_model, const std::string& name, const uint64_t version, ModelState** state)
 {
@@ -107,6 +121,14 @@ int32_t ModelState::GetParameter<int32_t>(const std::string& name)
 {
     return std::stoi(GetParameter<std::string>(name));
 }
+template <>
+std::vector<int32_t> ModelState::GetParameter<std::vector<int32_t>>(const std::string& name)
+{
+    auto deviceIdsStr = GetParameter<std::string>(name);
+    // Parse as comma delimited string
+    return csvStrToVecInt(deviceIdsStr);
+}
+
 template <>
 uint32_t ModelState::GetParameter<uint32_t>(const std::string& name)
 {
diff --git a/inflight_batcher_llm/src/model_state.h b/inflight_batcher_llm/src/model_state.h
index 2e1b59bf..fff1a1a4 100644
--- a/inflight_batcher_llm/src/model_state.h
+++ b/inflight_batcher_llm/src/model_state.h
@@ -104,4 +104,7 @@ float ModelState::GetParameter<float>(const std::string& name);
 template <>
 bool ModelState::GetParameter<bool>(const std::string& name);
 
+template <>
+std::vector<int32_t> ModelState::GetParameter<std::vector<int32_t>>(const std::string& name);
+
 } // namespace triton::backend::inflight_batcher_llm
diff --git a/scripts/benchmarking/build_model.sh b/scripts/benchmarking/build_model.sh
index 625e5267..661f72fa 100644
--- a/scripts/benchmarking/build_model.sh
+++ b/scripts/benchmarking/build_model.sh
@@ -14,7 +14,7 @@ GPT2=/trt_llm_data/llm-models/gpt2
 OPT_125M=/trt_llm_data/llm-models/opt-125m
 LLAMA=/trt_llm_data/llm-models/llama-models/llama-7b-hf
 GPTJ=/trt_llm_data/llm-models/gpt-j-6b
-MISTRAL=/trt_llm_data/llm-models/Mistral-7B-v0.1
+MISTRAL=/trt_llm_data/llm-models/mistral-7b-v0.1
 
 set -e
 
 pushd ../../
@@ -59,7 +59,6 @@ if [ "$MODEL" = "mistral-7b-fp16" ]; then
         --use_gemm_plugin float16 \
         --output_dir "$ENGINE_PATH" \
         --max_batch_size "$BS" --max_input_len 32256 --max_output_len 512 \
-        --use_rmsnorm_plugin float16 \
         --enable_context_fmha --remove_input_padding \
         --use_inflight_batching --paged_kv_cache \
         --max_num_tokens "$MAX_TOKENS"
@@ -226,7 +225,7 @@ if [ "$MODEL" = "llama-70b-fp16" ]; then
         --world_size "$WORLD_SIZE" \
         --tp_size "$TP" \
         --pp_size "$PP" \
-        --n_layer 80 --n_head 64 -n_kv_head 8 --n_embd 8192 --inter_size 28672 \
+        --n_layer 80 --n_head 64 --n_kv_head 8 --n_embd 8192 --inter_size 28672 \
         --vocab_size 32000 --n_positions 4096 --hidden_act "silu" \
         --ffn_dim_multiplier 1.3 --multiple_of 4096 \
         --use_gemm_plugin float16
diff --git a/tensorrt_llm b/tensorrt_llm
index b57221b7..e06f537e 160000
--- a/tensorrt_llm
+++ b/tensorrt_llm
@@ -1 +1 @@
-Subproject commit b57221b764bc579cbb2490154916a871f620e2c4
+Subproject commit e06f537e08f792fd52e6fef7bbc7b54774492503
diff --git a/tools/inflight_batcher_llm/benchmark_core_model.py b/tools/inflight_batcher_llm/benchmark_core_model.py
index 707f0199..e8106133 100644
--- a/tools/inflight_batcher_llm/benchmark_core_model.py
+++ b/tools/inflight_batcher_llm/benchmark_core_model.py
@@ -52,12 +52,14 @@ def test_performance(client, input_start_ids, input_lens, output_lens, delays,
     user_data = utils.UserData()
     for i, ids in enumerate(input_start_ids):
         output0_len = np.ones_like([[1]]).astype(np.int32) * output_lens[i]
+        end_id = np.ones_like([[1]]).astype(np.int32) * -1
         inputs = [
             utils.prepare_tensor("input_ids", ids, FLAGS.protocol),
             utils.prepare_tensor("input_lengths", input_lens[i],
                                  FLAGS.protocol),
             utils.prepare_tensor("request_output_len", output0_len,
                                  FLAGS.protocol),
+            utils.prepare_tensor("end_id", end_id, FLAGS.protocol),
         ]
         time.sleep(delays[i])
diff --git a/tools/inflight_batcher_llm/end_to_end_test.py b/tools/inflight_batcher_llm/end_to_end_test.py
index 9c6c1d5c..581a9d34 100644
--- a/tools/inflight_batcher_llm/end_to_end_test.py
+++ b/tools/inflight_batcher_llm/end_to_end_test.py
@@ -70,6 +70,9 @@ def test_functionality(client, prompts, output_lens):
         generation_logits = result.as_numpy("generation_logits").astype(
             np.float32)
 
+        print(f"context_logits.shape: {context_logits.shape}")
+
print(f"generation_logits.shape: {generation_logits.shape}") + model_name = "postprocessing" inputs = [ utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol), @@ -116,11 +119,15 @@ def test_functionality(client, prompts, output_lens): ensemble_output = result.as_numpy('text_output') ensemble_cum_log_probs = result.as_numpy('cum_log_probs') ensemble_output_log_probs = result.as_numpy('output_log_probs') - result.as_numpy('context_logits') - result.as_numpy('generation_logits') + ensemble_context_logits = result.as_numpy('context_logits') + ensemble_generation_logits = result.as_numpy('generation_logits') + assert output0 == ensemble_output assert cum_log_probs == ensemble_cum_log_probs assert (output_log_probs == ensemble_output_log_probs).all() + assert (context_logits == ensemble_context_logits).all() + assert (generation_logits == ensemble_generation_logits).all() + if FLAGS.verbose: print('Response: {}'.format(result.get_response())) print('Output: {}'.format(ensemble_output)) diff --git a/tools/version.txt b/tools/version.txt index a5d5936f..86dc23dc 100644 --- a/tools/version.txt +++ b/tools/version.txt @@ -1 +1 @@ -dcc9252db07dc9e8d4584fd92dbbf743a6c18b4e +63ca8816513a8afe8fdda1235cd8228278c5a785