
Update TensorRT-LLM backend #324

Merged: 3 commits, Jan 31, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -156,7 +156,6 @@ python3 build.py --model_dir=./c-model/gpt2/4-gpu/ \
--paged_kv_cache \
--use_gemm_plugin float16 \
--remove_input_padding \
--use_layernorm_plugin float16 \
--hidden_act gelu \
--parallel_build \
--output_dir=engines/fp16/4-gpu
@@ -222,6 +221,7 @@ The following table shows the fields that may need to be modified before deployment:
| `enable_trt_overlap` | Optional (default=`false`). Set to `true` to partition available requests into 2 'microbatches' that can be run concurrently to hide exposed CPU runtime |
| `exclude_input_in_output` | Optional (default=`false`). Set to `true` to only return completion tokens in a response. Set to `false` to return the prompt tokens concatenated with the generated tokens |
| `normalize_log_probs` | Optional (default=`true`). Set to `false` to skip normalization of `output_log_probs` |
| `enable_chunked_context` | Optional (default=`false`). Set to `true` to enable context chunking. |
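
As a quick illustration (not part of this diff), the new field would typically be filled in like the other placeholders in `triton_model_repo/tensorrt_llm/config.pbtxt`. The sketch below assumes the repo's `tools/fill_template.py` helper and a model repository copied to `triton_model_repo/`; per the backend change further down, context chunking also requires an engine built with `use_paged_context_fmha`.

```bash
# Hedged sketch, not part of the change: substitute the new placeholder.
# Paths and the fill_template.py usage are assumptions based on the repo layout.
python3 tools/fill_template.py -i triton_model_repo/tensorrt_llm/config.pbtxt \
    enable_chunked_context:true
```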

*triton_model_repo/postprocessing/config.pbtxt*

2 changes: 2 additions & 0 deletions all_models/gpt/tensorrt_llm/1/model.py
@@ -84,6 +84,7 @@ def initialize(self, args):
hidden_size = config['builder_config']['hidden_size'] // world_size
vocab_size = config['builder_config']['vocab_size']
num_layers = config['builder_config']['num_layers']
max_batch_size = config['builder_config']['max_batch_size']
num_kv_heads = num_heads
if "num_kv_heads" in config['builder_config'].keys():
num_kv_heads = (config['builder_config']['num_kv_heads'] +
@@ -96,6 +97,7 @@ def initialize(self, args):
self.rank = mpi_rank()

model_config = ModelConfig(
max_batch_size=max_batch_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
hidden_size=hidden_size,
12 changes: 12 additions & 0 deletions all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
@@ -356,3 +356,15 @@ parameters: {
string_value: "${normalize_log_probs}"
}
}
parameters: {
key: "enable_chunked_context"
value: {
string_value: "${enable_chunked_context}"
}
}
parameters: {
key: "gpu_device_ids"
value: {
string_value: "${gpu_device_ids}"
}
}
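
For illustration only (not part of the diff), these placeholders could be substituted with the repo's `tools/fill_template.py` helper. The backend parses `gpu_device_ids` as a comma-separated list of integer device ids and selects devices automatically when the parameter is left unset (see `model_state.cc` below); the concrete values shown are assumptions.

```bash
# Hedged sketch: fill each new placeholder. "0" pins the instance to GPU 0,
# while a multi-GPU deployment would pass a comma-separated list such as 0,1.
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt \
    enable_chunked_context:false
python3 tools/fill_template.py -i all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt \
    gpu_device_ids:0
```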
1 change: 0 additions & 1 deletion ci/L0_backend_trtllm/generate_engines.sh
@@ -53,7 +53,6 @@ function build_tensorrt_engine_inflight_batcher {
--paged_kv_cache \
--use_gemm_plugin float16 \
--remove_input_padding \
--use_layernorm_plugin float16 \
--hidden_act gelu \
--parallel_build \
--output_dir="${OUTPUT_DIR}"
64 changes: 17 additions & 47 deletions inflight_batcher_llm/CMakeLists.txt
@@ -208,48 +208,11 @@ target_compile_options(
/D_WIN32_WINNT=0x0A00
/EHsc>)

add_library(tensorrt_llm STATIC IMPORTED)
add_library(tensorrt_llm SHARED IMPORTED)
set_property(
TARGET tensorrt_llm
PROPERTY IMPORTED_LOCATION
"${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm_static.a")

add_library(tensorrt_llm_batch_manager STATIC IMPORTED)
execute_process(
COMMAND ${Python3_EXECUTABLE} "-c"
"import torch; print(torch.compiled_with_cxx11_abi(),end='');"
RESULT_VARIABLE _PYTHON_SUCCESS
OUTPUT_VARIABLE USE_CXX11_ABI)
message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}")

set(BATCH_MANAGER_ARCH "unknown")

message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
set(BATCH_MANAGER_ARCH "x86_64-linux-gnu")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
set(BATCH_MANAGER_ARCH "aarch64-linux-gnu")
else()
message(
FATAL_ERROR
"The system processor type is unsupported: ${CMAKE_SYSTEM_PROCESSOR}")
endif()

if(USE_CXX11_ABI)
set_property(
TARGET tensorrt_llm_batch_manager
PROPERTY
IMPORTED_LOCATION
"${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/${BATCH_MANAGER_ARCH}/libtensorrt_llm_batch_manager_static.a"
)
else()
set_property(
TARGET tensorrt_llm_batch_manager
PROPERTY
IMPORTED_LOCATION
"${TRTLLM_DIR}/cpp/tensorrt_llm/batch_manager/${BATCH_MANAGER_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a"
)
endif()
"${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so")

add_library(nvinfer_plugin_tensorrt_llm SHARED IMPORTED)
set_property(
@@ -289,7 +252,7 @@ if(TRITON_ENABLE_METRICS)
triton-core-serverapi # from repo-core
triton-core-serverstub # from repo-core
triton-backend-utils # from repo-backend
tensorrt_llm_batch_manager)
tensorrt_llm)

target_compile_definitions(triton-tensorrt-llm-backend
PRIVATE TRITON_ENABLE_METRICS=1)
@@ -320,8 +283,7 @@ endif() # TRITON_BUILD

target_link_libraries(
triton-tensorrt-llm-backend
PRIVATE tensorrt_llm_batch_manager
tensorrt_llm
PRIVATE tensorrt_llm
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
triton-core-serverstub # from repo-core
@@ -348,11 +310,13 @@ if(WIN32)
else()
set_target_properties(
triton-tensorrt-llm-backend
PROPERTIES POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm
LINK_DEPENDS
${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript
LINK_FLAGS "-Wl,--version-script libtriton_tensorrtllm.ldscript")
PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_tensorrtllm
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_tensorrtllm.ldscript
LINK_FLAGS
"-Wl,--version-script libtriton_tensorrtllm.ldscript -Wl,-rpath,'$ORIGIN' -Wl,--no-undefined"
)
endif()

#
@@ -375,6 +339,12 @@ if(TRITON_BUILD)
FOLLOW_SYMLINKS)
install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM}
DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)

file(GLOB LIBINFER_PLUGIN_TENSORRT_LLM
"${TRTLLM_DIR}/cpp/build/tensorrt_llm/libtensorrt_llm.so*"
FOLLOW_SYMLINKS)
install(FILES ${LIBINFER_PLUGIN_TENSORRT_LLM}
DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/tensorrtllm)
endif() # TRITON_BUILD

install(
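Because the backend now links TensorRT-LLM as a shared library and locates it through the `$ORIGIN` rpath added above, a minimal sanity check of an installed build might look like the sketch below. The `/opt/tritonserver` prefix stands in for `CMAKE_INSTALL_PREFIX` and is an assumption, not something this change mandates.

```bash
# Hedged sketch: confirm libtensorrt_llm.so is resolved next to the backend
# via the $ORIGIN rpath. The install prefix is an assumed default.
BACKEND_DIR=/opt/tritonserver/backends/tensorrtllm
readelf -d "${BACKEND_DIR}/libtriton_tensorrtllm.so" | grep -E 'RPATH|RUNPATH'
ldd "${BACKEND_DIR}/libtriton_tensorrtllm.so" | grep tensorrt_llm
```
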
1 change: 0 additions & 1 deletion inflight_batcher_llm/README.md
@@ -39,7 +39,6 @@ python3 build.py --model_dir=${model_directory} \
--paged_kv_cache \
--use_gemm_plugin float16 \
--remove_input_padding \
--use_layernorm_plugin float16 \
--hidden_act gelu \
--output_dir=engines/fp16/1-gpu
```
54 changes: 50 additions & 4 deletions inflight_batcher_llm/src/model_instance_state.cc
@@ -148,12 +148,35 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
TLLM_LOG_WARNING(e.what());
}

bool enableChunkedContext = false;
try
{
enableChunkedContext = model_state_->GetParameter<bool>("enable_chunked_context");
if (enableChunkedContext)
{
TLLM_LOG_WARNING(
"enable_chunked_context is set to true, will use context chunking "
"(requires building the model with use_paged_context_fmha).");
}
}
catch (const std::exception& e)
{
// If parameter is not specified, just ignore
TLLM_LOG_WARNING("enable_chunked_context is not specified, will be set to false.");
}

if (mIsDecoupled && schedulerPolicy != SchedulerPolicy::GUARANTEED_NO_EVICT)
{
TLLM_LOG_WARNING(
"The batch scheduler policy will be set to guaranteed_no_evict"
"since the backend operates in decoupled mode");
schedulerPolicy = SchedulerPolicy::GUARANTEED_NO_EVICT;
if (!enableChunkedContext)
{
TLLM_LOG_WARNING(
"Decoupled mode with a batch scheduler policy other than guaranteed_no_evict "
"requires building the model with use_paged_context_fmha and setting "
"enable_chunked_context to true. "
"The batch scheduler policy will be set to guaranteed_no_evict "
"since enable_chunked_context is false.");
schedulerPolicy = SchedulerPolicy::GUARANTEED_NO_EVICT;
}
}

std::optional<float> kvCacheFreeGpuMemFraction = std::nullopt;
@@ -226,13 +249,36 @@ ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_Mo
TLLM_LOG_WARNING("enable_kv_cache_reuse is not specified, will be set to false");
}

std::optional<std::vector<int32_t>> gpuDeviceIds;
try
{
gpuDeviceIds = model_state_->GetParameter<std::vector<int32_t>>("gpu_device_ids");

if (gpuDeviceIds)
{
std::string deviceIdInfo("Using GPU device ids: ");
for (auto const& deviceId : gpuDeviceIds.value())
{
deviceIdInfo += std::to_string(deviceId) + " ";
}
TLLM_LOG_INFO(deviceIdInfo);
}
}
catch (const std::exception& e)
{
// If parameter is not specified, just ignore
TLLM_LOG_WARNING("gpu_device_ids is not specified, will be automatically set");
}

TrtGptModelOptionalParams optionalParams;
optionalParams.kvCacheConfig.maxTokens = maxTokensInPagedKvCache;
optionalParams.kvCacheConfig.freeGpuMemoryFraction = kvCacheFreeGpuMemFraction;
optionalParams.kvCacheConfig.maxAttentionWindow = maxAttentionWindow;
optionalParams.kvCacheConfig.enableBlockReuse = enableKVCacheReuse;
optionalParams.enableTrtOverlap = enableTrtOverlap;
optionalParams.normalizeLogProbs = normalizeLogProbs;
optionalParams.enableChunkedContext = enableChunkedContext;
optionalParams.deviceIds = gpuDeviceIds;

mBatchManager = std::make_shared<GptManager>(
mModelPath, mTrtGptModelType, maxBeamWidth, schedulerPolicy,
22 changes: 22 additions & 0 deletions inflight_batcher_llm/src/model_state.cc
@@ -29,6 +29,20 @@
namespace triton::backend::inflight_batcher_llm
{

/// Helper function to parse a csv delimited string to a vector of ints
std::vector<int32_t> csvStrToVecInt(std::string const& str)
{
std::vector<int32_t> output;
std::stringstream ss(str);
while (ss.good())
{
std::string substr;
getline(ss, substr, ',');
output.push_back(std::stoi(substr));
}
return output;
}

TRITONSERVER_Error* ModelState::Create(
TRITONBACKEND_Model* triton_model, const std::string& name, const uint64_t version, ModelState** state)
{
@@ -107,6 +121,14 @@ int32_t ModelState::GetParameter<int32_t>(const std::string& name)
return std::stoi(GetParameter<std::string>(name));
}

template <>
std::vector<int32_t> ModelState::GetParameter<std::vector<int32_t>>(const std::string& name)
{
auto deviceIdsStr = GetParameter<std::string>(name);
// Parse as comma delimited string
return csvStrToVecInt(deviceIdsStr);
}

template <>
uint32_t ModelState::GetParameter<uint32_t>(const std::string& name)
{
3 changes: 3 additions & 0 deletions inflight_batcher_llm/src/model_state.h
@@ -104,4 +104,7 @@ float ModelState::GetParameter<float>(const std::string& name);
template <>
bool ModelState::GetParameter<bool>(const std::string& name);

template <>
std::vector<int32_t> ModelState::GetParameter<std::vector<int32_t>>(const std::string& name);

} // namespace triton::backend::inflight_batcher_llm
5 changes: 2 additions & 3 deletions scripts/benchmarking/build_model.sh
@@ -14,7 +14,7 @@ GPT2=/trt_llm_data/llm-models/gpt2
OPT_125M=/trt_llm_data/llm-models/opt-125m
LLAMA=/trt_llm_data/llm-models/llama-models/llama-7b-hf
GPTJ=/trt_llm_data/llm-models/gpt-j-6b
MISTRAL=/trt_llm_data/llm-models/Mistral-7B-v0.1
MISTRAL=/trt_llm_data/llm-models/mistral-7b-v0.1

set -e
pushd ../../
@@ -59,7 +59,6 @@ if [ "$MODEL" = "mistral-7b-fp16" ]; then
--use_gemm_plugin float16 \
--output_dir "$ENGINE_PATH" \
--max_batch_size "$BS" --max_input_len 32256 --max_output_len 512 \
--use_rmsnorm_plugin float16 \
--enable_context_fmha --remove_input_padding \
--use_inflight_batching --paged_kv_cache \
--max_num_tokens "$MAX_TOKENS"
@@ -226,7 +225,7 @@ if [ "$MODEL" = "llama-70b-fp16" ]; then
--world_size "$WORLD_SIZE" \
--tp_size "$TP" \
--pp_size "$PP" \
--n_layer 80 --n_head 64 -n_kv_head 8 --n_embd 8192 --inter_size 28672 \
--n_layer 80 --n_head 64 --n_kv_head 8 --n_embd 8192 --inter_size 28672 \
--vocab_size 32000 --n_positions 4096 --hidden_act "silu" \
--ffn_dim_multiplier 1.3 --multiple_of 4096 \
--use_gemm_plugin float16
2 changes: 1 addition & 1 deletion tensorrt_llm
Submodule tensorrt_llm updated 759 files
2 changes: 2 additions & 0 deletions tools/inflight_batcher_llm/benchmark_core_model.py
@@ -52,12 +52,14 @@ def test_performance(client, input_start_ids, input_lens, output_lens, delays,
user_data = utils.UserData()
for i, ids in enumerate(input_start_ids):
output0_len = np.ones_like([[1]]).astype(np.int32) * output_lens[i]
end_id = np.ones_like([[1]]).astype(np.int32) * -1
inputs = [
utils.prepare_tensor("input_ids", ids, FLAGS.protocol),
utils.prepare_tensor("input_lengths", input_lens[i],
FLAGS.protocol),
utils.prepare_tensor("request_output_len", output0_len,
FLAGS.protocol),
utils.prepare_tensor("end_id", end_id, FLAGS.protocol),
]

time.sleep(delays[i])
11 changes: 9 additions & 2 deletions tools/inflight_batcher_llm/end_to_end_test.py
@@ -70,6 +70,9 @@ def test_functionality(client, prompts, output_lens):
generation_logits = result.as_numpy("generation_logits").astype(
np.float32)

print(f"context_logits.shape: {context_logits.shape}")
print(f"generation_logits.shape: {generation_logits.shape}")

model_name = "postprocessing"
inputs = [
utils.prepare_tensor("TOKENS_BATCH", output0, FLAGS.protocol),
@@ -116,11 +119,15 @@ def test_functionality(client, prompts, output_lens):
ensemble_output = result.as_numpy('text_output')
ensemble_cum_log_probs = result.as_numpy('cum_log_probs')
ensemble_output_log_probs = result.as_numpy('output_log_probs')
result.as_numpy('context_logits')
result.as_numpy('generation_logits')
ensemble_context_logits = result.as_numpy('context_logits')
ensemble_generation_logits = result.as_numpy('generation_logits')

assert output0 == ensemble_output
assert cum_log_probs == ensemble_cum_log_probs
assert (output_log_probs == ensemble_output_log_probs).all()
assert (context_logits == ensemble_context_logits).all()
assert (generation_logits == ensemble_generation_logits).all()

if FLAGS.verbose:
print('Response: {}'.format(result.get_response()))
print('Output: {}'.format(ensemble_output))
2 changes: 1 addition & 1 deletion tools/version.txt
@@ -1 +1 @@
dcc9252db07dc9e8d4584fd92dbbf743a6c18b4e
63ca8816513a8afe8fdda1235cd8228278c5a785