
Commit 530ab91 (merge commit; 2 parents: 43b18bd + 1388e84)

rebase

Signed-off-by: junq <[email protected]>

128 files changed: +3336 -2052 lines


.gitattributes

Lines changed: 2 additions & 0 deletions
@@ -7,3 +7,5 @@
 triton_backend/tools/gpt/input_data.json filter=lfs diff=lfs merge=lfs -text
 *cubin.cpp filter=lfs diff=lfs merge=lfs -text
 docs/source/blogs/media/tech_blog3_mla_absorb.png filter=lfs diff=lfs merge=lfs -text
+tests/integration/test_input_files/*.png filter=lfs diff=lfs merge=lfs -text
+tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text
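
The two added patterns route the integration-test images through Git LFS. For reference, equivalent entries would be produced by running git lfs track "tests/integration/test_input_files/*.png" (and likewise for *.jpg), which appends exactly this kind of filter/diff/merge attribute line to .gitattributes.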

README.md

Lines changed: 1 addition & 1 deletion
@@ -253,5 +253,5 @@ Deprecation is used to inform developers that some APIs and tools are no longer
 ## Useful Links
 - [Quantized models on Hugging Face](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4): A growing collection of quantized (e.g., FP8, FP4) and optimized LLMs, including [DeepSeek FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), ready for fast inference with TensorRT-LLM.
 - [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo): A datacenter scale distributed inference serving framework that works seamlessly with TensorRT-LLM.
-- [AutoDeploy](./examples/auto_deploy/README.md): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
+- [AutoDeploy](https://nvidia.github.io/TensorRT-LLM/torch/auto_deploy/auto-deploy.html): A prototype backend for TensorRT-LLM to simplify and accelerate the deployment of PyTorch models.
 - [WeChat Discussion Group](https://github.com/NVIDIA/TensorRT-LLM/issues/5359): A real-time channel for TensorRT-LLM Q&A and news.

cpp/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ add_compile_definitions("TLLM_GEN_EXPORT_INTERFACE")
 add_compile_definitions("TLLM_ENABLE_CUDA")
 
 set(BINDING_TYPE
-    "pybind"
+    "nanobind"
     CACHE STRING
     "Binding type of Python bindings for C++ runtime and batch manager")
 
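
This switches the default Python binding layer for the C++ runtime and batch manager from pybind11 to nanobind. Because BINDING_TYPE is a CACHE STRING, the previous behavior should still be selectable at configure time (e.g. cmake -DBINDING_TYPE=pybind ...); that override mechanism is standard CMake cache-variable behavior, not something added by this commit.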

cpp/include/tensorrt_llm/executor/dataTransceiverState.h

Lines changed: 20 additions & 12 deletions
@@ -52,29 +52,30 @@ class CacheState final
         AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2)
         : mModelConfig(std::move(modelConfig))
         , mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
-              worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(), worldConfig.getTensorParallelism()}
+              worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),
+              worldConfig.getTensorParallelism()}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
     }
 
     CacheState(std::vector<SizeType32> nbKvHeadPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0)
+        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
+        nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
+        bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
         : mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
-        , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize}
+        , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
     }
 
     CacheState(SizeType32 nbAttentionLayers, SizeType32 nbKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, nvinfer1::DataType dataType,
-        AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
-        int DPrank = 0, int DPsize = 0)
+        SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
+        nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
+        bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
         : mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
-        , mParallelConfig{tensorParallelism, pipelineParallelism, enableAttentionDP, DPrank, DPsize}
+        , mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
         , mDataType{dataType}
         , mAttentionConfig(attentionType, kvFactor)
     {
@@ -83,7 +84,7 @@ class CacheState final
     [[nodiscard]] bool operator==(kv_cache::CacheState const& other) const noexcept
     {
         return mModelConfig == other.mModelConfig && mParallelConfig == other.mParallelConfig
-            && mDataType == other.mDataType;
+            && mAttentionConfig == other.mAttentionConfig && mDataType == other.mDataType;
     }
 
     struct ModelConfig
@@ -103,15 +104,16 @@ class CacheState final
     {
         SizeType32 mTensorParallelism;
         SizeType32 mPipelineParallelism;
+        SizeType32 mContextParallelism;
         bool mEnableAttentionDP;
         SizeType32 mDPrank;
         SizeType32 mDPsize;
 
         [[nodiscard]] bool operator==(ParallelConfig const& other) const noexcept
         {
             return mTensorParallelism == other.mTensorParallelism && mPipelineParallelism == other.mPipelineParallelism
-                && mEnableAttentionDP == other.mEnableAttentionDP && mDPrank == other.mDPrank
-                && mDPsize == other.mDPsize;
+                && mContextParallelism == other.mContextParallelism && mEnableAttentionDP == other.mEnableAttentionDP
+                && mDPrank == other.mDPrank && mDPsize == other.mDPsize;
         }
     };
 
@@ -125,6 +127,11 @@ class CacheState final
         {
         }
 
+        [[nodiscard]] bool operator==(AttentionConfig const& other) const noexcept
+        {
+            return mAttentionType == other.mAttentionType && mKvFactor == other.mKvFactor;
+        }
+
         // attentionType ;
         AttentionType mAttentionType;
         int mKvFactor;
@@ -162,6 +169,7 @@ class CacheState final
         sstring << "mTokensPerBlock:" << mModelConfig.mTokensPerBlock << "\n";
         sstring << "tp:" << mParallelConfig.mTensorParallelism << "\n";
         sstring << "pp:" << mParallelConfig.mPipelineParallelism << "\n";
+        sstring << "cp:" << mParallelConfig.mContextParallelism << "\n";
         sstring << "enableAttentionDP:" << mParallelConfig.mEnableAttentionDP << "\n";
         sstring << "datatype:" << static_cast<int32_t>(mDataType) << "\n";
         sstring << "attentionType:" << static_cast<int32_t>(mAttentionConfig.mAttentionType) << "\n";
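
For orientation, a minimal usage sketch of the widened constructor and the stricter comparison follows. The include path and signatures come from the diff above; the namespace alias, the nested AttentionType qualification, the kMLA enumerator, and the concrete values are assumptions, not code from this commit.

#include <vector>

#include "tensorrt_llm/executor/dataTransceiverState.h"

namespace texec = tensorrt_llm::executor;
using texec::kv_cache::CacheState;

void cacheStateExample()
{
    // 4 layers with 8 KV heads each, head size 128, 32 tokens per block,
    // TP=2, PP=1, and the newly threaded-through CP=2 argument.
    CacheState lhs(std::vector<texec::SizeType32>{8, 8, 8, 8}, /*sizePerHead=*/128,
        /*tokensPerBlock=*/32, /*tensorParallelism=*/2, /*pipelineParallelism=*/1,
        /*contextParallelism=*/2, nvinfer1::DataType::kHALF);

    // Same layout but a non-default attention type (kMLA is assumed here).
    CacheState rhs(std::vector<texec::SizeType32>{8, 8, 8, 8}, 128, 32, 2, 1, 2,
        nvinfer1::DataType::kHALF, CacheState::AttentionType::kMLA);

    // After this commit, operator== also compares mAttentionConfig (attention
    // type and KV factor), and ParallelConfig equality includes CP, so these
    // two states no longer compare equal.
    bool const equal = (lhs == rhs); // false: attention types differ
    (void) equal;
}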

cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h

Lines changed: 12 additions & 9 deletions
@@ -833,7 +833,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     // Runs for 3 iterations or 1 second and picks the best option
     int pickBestTactic(MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
-        auto tactics = mMoERunner.getTactics();
+        auto tactics = mMoERunner.getTactics(static_cast<MoeGemmId>(gemm_to_profile));
         ::nvtx3::scoped_range nvtx(tensorrt_llm::common::nvtx::nextColor(),
             "Tactic Profiling GEMM " + std::to_string(static_cast<int>(gemm_to_profile)));
         // We save space by reusing the same workspace buffer for all tactics when doing full layer profiling. So we
@@ -925,12 +925,14 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     std::pair<int, int> setTactic(
         int tactic_idx1, int tactic_idx2, MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
-        auto tactics = mMoERunner.getTactics();
+        auto tactics1 = mMoERunner.getTactics(MoeGemmId::GEMM_1);
+        auto tactics2 = mMoERunner.getTactics(MoeGemmId::GEMM_2);
         std::vector<std::pair<std::reference_wrapper<int>, GemmToProfile>> tactics_to_profile{
             {tactic_idx1, GemmToProfile::GEMM_1}, {tactic_idx2, GemmToProfile::GEMM_2}};
         for (auto& combo : tactics_to_profile)
         {
             auto& t = combo.first.get();
+            auto& tactics = combo.second == GemmToProfile::GEMM_1 ? tactics1 : tactics2;
             if (combo.second != gemm_to_profile && gemm_to_profile != GemmToProfile::LAYER)
             {
                 t = 0; // Unneeded tactic, set to 0
@@ -947,7 +949,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
             }
         }
 
-        mMoERunner.setTactic(tactics[tactic_idx1], tactics[tactic_idx2]);
+        mMoERunner.setTactic(tactics1[tactic_idx1], tactics2[tactic_idx2]);
         mBestTacticGemm1 = tactic_idx1;
         mBestTacticGemm2 = tactic_idx2;
         return {tactic_idx1, tactic_idx2};
@@ -965,7 +967,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         auto expert_weights_size
             = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1Size : mExpertWeight2Size;
 
-        auto tactics = mMoERunner.getTactics()[tactic_idx];
+        auto tactics = mMoERunner.getTactics(static_cast<MoeGemmId>(gemm_to_profile))[tactic_idx];
         if (static_cast<int>(gemm_to_profile) != static_cast<int>(mGemmProfilerBackend.mGemmToProfile))
         {
             throw std::runtime_error("Configuration mismatch between mGemmProfilerBackend and runMoEPermute");
@@ -1074,11 +1076,12 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
     }
     if (LOG_LEVEL >= INFO)
     {
-        auto tactics = mMoERunner.getTactics();
-        std::cout << "Selected tactic #1: " << tactic_idx1 << "/" << tactics.size() << "\n"
-                  << tactics[tactic_idx1].toString() << std::endl;
-        std::cout << "Selected tactic #2: " << tactic_idx2 << "/" << tactics.size() << "\n"
-                  << tactics[tactic_idx2].toString() << std::endl;
+        auto tactics1 = mMoERunner.getTactics(MoeGemmId::GEMM_1);
+        auto tactics2 = mMoERunner.getTactics(MoeGemmId::GEMM_2);
+        std::cout << "Selected tactic #1: " << tactic_idx1 << "/" << tactics1.size() << "\n"
+                  << tactics1[tactic_idx1].toString() << std::endl;
+        std::cout << "Selected tactic #2: " << tactic_idx2 << "/" << tactics2.size() << "\n"
+                  << tactics2[tactic_idx2].toString() << std::endl;
     }
     state.counters["tactic_idx1"] = tactic_idx1;
     state.counters["tactic_idx2"] = tactic_idx2;
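
Across all five hunks the change is uniform: getTactics() now takes a MoeGemmId, so each GEMM carries its own tactic list and an index is only meaningful against the list it was drawn from. A minimal sketch of the resulting flow follows; the Runner template parameter stands in for the fixture's mMoERunner and the zero indices are arbitrary placeholders (both assumptions, not code from this commit).

#include <utility> // std::pair

// Sketch only: MoeGemmId, getTactics, and setTactic appear in the diff above.
template <typename Runner>
std::pair<int, int> chooseTactics(Runner& runner)
{
    auto tactics1 = runner.getTactics(MoeGemmId::GEMM_1); // tactic list for GEMM 1
    auto tactics2 = runner.getTactics(MoeGemmId::GEMM_2); // may differ in size and content
    int tactic_idx1 = 0; // must index tactics1, never a shared list
    int tactic_idx2 = 0; // must index tactics2
    runner.setTactic(tactics1[tactic_idx1], tactics2[tactic_idx2]);
    return {tactic_idx1, tactic_idx2};
}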
