/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "BatchedGemmOptions.h"

namespace batchedGemm {

namespace tensorrt_llm {
namespace kernels {
// clang-format off

#define TLLM_GEN_COMMIT "a845b78"
#define TLLM_GEN_EXPORT_VERSION "6.0.3.0.2.1"
#define TLLM_GEN_BATCHED_GEMM_CONFIG_HASH "ea4dc24"
static constexpr size_t tllmGenBatchedGemmListLen = 216;

#ifndef EXCLUDE_SM_100
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin = nullptr;
#endif // EXCLUDE_SM_100

#ifndef EXCLUDE_SM_100
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len = 0;
#endif // EXCLUDE_SM_100


static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = {
#ifndef EXCLUDE_SM_100
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "eefeffd9eadad2fea04dcc26952e24722385e97fa67adac2f9c0b1577f47cf03", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "e0c5c8fd3f7b8b90aa6489808cbd0f4a23e821384a3c2401d806bc706e207ffb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "cca9316d286245d1c6c7f067f740e7e2c4dc5bbb60decd26cf41948c1297e14d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "7f4033df8fd884f0151410156b8e67b8916df09458e5cacea22bc55658937fa2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "2eab8679f43b1fbb35732027a87698224e76e7d4647e9153eb56b0283a98adf6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "13e58cc0c326753b1b9198f24be7c39eb0c130d28b6d745e30f19aa8b78f4d4a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "c004d8138c3d9deec1b75d3aa8fd9edf1cc6714dcad17610f7607c8ef2c6791a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "785ddc56dfe5575006a2c53443e3a1628d79391b45a9e5df8e394c6d34a27500", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "53253aaa3e1eae65aa85a7dcf2ad8f2648b1bb54b45ddcc1c71817e2250bba72", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "a32e55efdd9f642ba2009ca2218e48270a546f8be1baf56b200b42ca5d4119b0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "0da42d4dd50ad6b918492635f9a86ede7040c522f9eab92c2b5bb9bde8e19f5a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "966d4bfb368c853b8b94b3d2921e683fc5ceecdc71f968f984a9f982f1b4b0d5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "05da9f4d356497433cf7fcdc69edd7737841ec59d63905c890bc2d3a78a0ffcd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "fe18af185d92f54add51c70d89b460bd99d5cbf5b5bc31bfd467dd2bb2f48aef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "5099139e216c7e8c55f1c01fec733e03a842610f12d4edb1f088adad44cfeaf8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "f751209abb6f1e64b2666032c2ad87723811838a30ca6795fe2228000fc1f54b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "5ed1d900e48f6cdc12164c37ae5d9bc5eb36a8e2e542ac38eb1ee6133a0ef623", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "d770189eab9d5439914f4d5b35c3175b86f43a01525ab5263bcad25471e2955b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "8efd0b9fd8af7f3be0116f8c38edb5ad1495a2a694b1b5a7a331793d341146ab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "e392a77dd452624b3424df344aa6c1a85a51df0e2a930a62fd43d874b88c6dd8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "c9a30e4c90e493a5dba78ced1fc6e8d40854efa695c849a85349ea8b0a97e24f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "7f9a49f25155a370100a908c24e028eb817f9b897c318d05394aa046b30140fa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "a53fa9ebbbe2314cfb538499ba6aeda534caa15ed21c993a81b782276626d644", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "07535e06fc9f656e5cb4d92046f3ee5c7761c1a3464e5a6bafdb49cdd1338d45", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "7fbea7da050eda6dbcc8c75863aa1f287f27180323cce1834d0a43cf2cd41362", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "323895f8e1a0bb51de7ceeeea5db1392c5f5f452ce4b2531df19705da6042b26", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "1fec9ccd0711a62dc45f0e438f377408c881fdb58e7ef0a5366c88880b22c3ac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "38e6f7c08acf4d75cd27a9a954a3880cf5e9de2749d52c035031c337ddcd5f15", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "5f781916f90615e74393a8ba49c9856a3634e282d0455d4d9051a0f67a91f772", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "b6baff41f23e7403e09a3e59ed1a0b36ce8a75dfd427abd8a033030ba2048074", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "1c3a7fe3fbbc58d7e0bb144942f6efeb20bfb54a6b5924de29db0b2d58770ed5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 480, "020a92898fe8031cdbe96b33c780a2f005c626a4fae099856703ded545e6dc85", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 448, "97848ac497568950c897958f7bca7b2b42bfc48f3973078914058eed1db76835", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "0cf5205a2f2ba4b4d0452087c0f79fab92465369d366c4e3e4aa5f947f6b8341", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "d98ec1cf7eb356250aaf3ecceb75d45d2306c4e8eed851f55109ca48c82b99d1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "9b7efa6fea7b0ee0b81b371288b8c4781692378ecbbb974eff3656a5335e90b4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "0d130aa379c7f458ca911cb5f4eaa7a5ac9c2bd09e01a3905a4e249805b2e1f9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "9eaa28148f68fa77461a400669f6fabdf8f5c6baf2602b09109132d7d8d04e17", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "c64de23272af8eeef40683d3981e2d1d0a8b88e799272fb0795030b00359e2e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "58e9dd97934a1ca51419c1e498da13ec07e76f802ff216a594f980b4eba71bc6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "f01da0a89976c09a4b4bb4fd0ada2a89888c24291ddf3e4cba77a535ee4db049", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "74b1f993cb10b32c6e495d3f2732ed729539799d901dbe6e075094a5f82d4391", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "d0d3abbd3e982f5e90e350134c84b4cb98d9a0ae3a1d530145b97b6eff8636c3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "d8b3a6bea1acbfd8be7d778943a77c2433efeadaed818c37ca952c20171696b2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "2c3a6f1f2b6d2d415529e6d9f9e9162b3e44fa9846dc57c2f88353f92a471fd5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "691a8c9a76fb581e13c4d6aefb4f3c080749372e75b4ce2c5e130f7998add146", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "14e921de30f7340294fa6c64e09248d6c515f54b3529293be43b20d405e1448a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "d5846db71c190d10bded1229efe5210d729c957caf5a4fa9e7b0657a8b73828d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, "b5b7170bae335e776c845c35c66687038ff2c3e05fb630182bc86eb5d65475ac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_dynBatch_sm100a", 416, "8c232526ebedd5ef335897711ed8b61f9d7a102d643bb09d43af3c2ba387466b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_dynBatch_sm100a", 384, "dd1688489e9c1b695a810d0161160b9d29ff4441409cb4b8ffade8e4f3efa755", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, "489eadff4eed34445dbe0350e8ed916ad16207c557bfd0da76be804550b29f71", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "233c45fe86005c4c089bdbebc2a7dfdc451afd580e647a7207f97a2b3ac5898b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "c7cf1855fa90d6bf7e961555cfaedd588d2072e8a82ecdcfefb91d436c13a577", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "fd250a70cb3f0f82662c28aff92c74e2def025cb8ef68c0c0a5948da06846af7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_dynBatch_sm100a", 256, "dea7b6fbfa5b04718523199f47c3aac1b5659a49495b06a6e5f27b5ea6e0c3d1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_dynBatch_sm100a", 224, "acf142ab40b0e008d8125bebba611187d5a44ea035af764994402ce78601a457", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "90ba8fe2010b1b007f0f87f9bf63600f67dcf98d8fbf9003eb0a2a6952ecddad", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "b00f31e47fff0c62cd7a9f777ec0dbb37b099ce313fd1516c1e852869045727d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "4bdb9c47a7770200ced6492953751598be7a7ae46aa44a653dd0fee5d10d65fd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "c4a8ae55bc1985835ddd8be615891ead5fe882ceeb756e520ba91f6ad5f14d18", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "ff1e7fcfafa850ae2fbb72ab5b9932a10d760ada589da57bc82376ee32c5f73e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "f1c5a4f183e730386a580ec7270bc403b26012280d1f7a895a84fa410712b589", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "889425f4a76965ca805a72b3115ea31e30d62898efe1ad1907b05b5467630d94", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "8561b55c34c1c4a0bcfcf97b8f35a025b5ccb0033e6122da0bd29573bab2fcb0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "b33ac587a41606e939313a9f545615c895c919598307c96cc1c6fd040b7329b0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "8ebf5fa60923ba9e3a9acccc72606f58bd68379975e63978783320ad49495599", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "4941d914188cf9ca2e42e53a0f22a4484e8c5db404c92551b8e813e91302a6c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "e46f678531a58e49eab2500a783091105fae95a942fc0a29d686bcdc6b1081b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "0f530c0e107b44c7adb38b0bba5cb0a1f13fc623040b2afed1aa9f498e66ee0f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "337b9baac85e4eb7634f2fd2c299e31de54a9166c31e5334be563e24caa9be72", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "6e078c0b0d80a70d88a2107e6245cc6dee1cfe5a15c8d02e73303bfe324ac6b6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "b016ac599564f08751b042e8ac098de59ae61a90f3ba552d33c77c8f0068ac21", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "6b8427ac5bf6c6f453eecd4e25bff8094071253a80c60b74fb0dc10a7a64e8d7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "7192474aad18d02237034420b258d57e35d1aa28e415468ec7de38358d323c45", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "742b7b646972d4813740637d3f7bdd2b5781599bb33a578b099379b335d1796d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "7e2566336585da861fb42cb8a52426bcc7335ff715ea424d3c8f171a8381a63e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "a568e2cb47b71c7ffbd844c2c8390ae40b641dabfde139f664205a773e85c7df", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "5ca0dae96e7577914362db7b5a696659f7267680d3928a9d47251e7bf208c5a2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "bc8e3ba87856892550a05baeca608177fb16b88a0f875073332159995038a4b4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "59fd34a4d4038d06d8109cffc10b4a88da1cf4a0e607bf9d82293317a93f7d2a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "82ad26e7394cf1c4f682d3316d0ec8bd7e6ef6db2959bff8e3bf217deaceef4c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "af6577cfd5284768c94a27a87615a02bf6f96d613ab99350097590dca3ad8021", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "ccfcaff283c39869cae2b061218341ff192a77eace88823fa16c48587d64aff2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "7bc927bec780b1e38556dc64ddc1e97bb3e11e84e70563533a833bbddba67ce2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "842909d71b78c882467675f50b874ac02371b26e1e7bcbbe9b6dc614f2ffeae5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 416, "588ff82deb3de487aadc52a0e699b0cc9b28eec41e36d447774837d148ac6017", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 448, "9c15248ec77db7cd8a7e573d79a3f35fadd69cd45c6ac535df3e3d8f335bee84", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 384, "eac9f59231be3dd0183c403d336222c63af9762ad87167bfb1db612287a8272e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 416, "4e570892f5e9fc3a5d8cf4b19eadfe0394ae3282a929631a1be5c12de85bcb9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "a20a77fa6cb073077ef1040787ff105401829f9b26310d1cfea067810ee22fbd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "84095a6198ecb86804b646bc0d51da8358786643c9d15d7fa4ff1beba90ff6ff", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "73a1daf3ae85a97cc05cd2a7795c00bc12e10d2d875d7832347b978c6cd2ab55", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "9442a6ebb0b7ff88a1bb4c4a699cf95e60f35bdf8cae8d9e40cd27f7f5f9ae74", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "4644bb5466d570fb1e6e32c229f2676b4beb6288f15229ebeed598d9a6f4bd5f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "3e8049aef91a2b5258bfbb431e4ac46eaa24746bc4e24ce3804b3e524c6531a5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "d39968a13da207632c056e7c237888059a59ad850818110c10a7e8f30800ac37", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "223a321b86d79abab41c56eb9690d75530dd27626de3b033704a07426f5cc212", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "4ee25bef3b4798c6788bf5b69b8c51af09af7d7405320f4e43fa1f6dade1d72d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "c081a98fcfff619c87e7ca1e899cb4317662adacd3cf10322300798fad4f6851", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "01b9b88d7ce2e2ca1032b0d106fd37a47f1dc164939beaceb27b0b2a6e9b8848", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "5c6f5770c9fbe5648fc11a18198f64216ed9a6d9ccd8a62a1b79a29b8401979a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "a6819c636fa998a32c37485116f5d21a61fd775ec8bae841d1fb2fab9e58d568", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "c486aaffbb66d7c560684e120fddcf3bf095b926366894914a13974f85cdeb37", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "a0fdcd77ea6ffb1f5717da331d02ab1e74750b0ce48c03890c5fb58fbf182ba7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "536925dc87df0f5f2936b851e0631eba5f4f6170e9e054a7fe7d93a604c3cca0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "a086b20fac3c2956203acf0ce29e329737483db69a9856c142a739101d184026", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "c24f0d82bada70f6609a77154da00f5b3085968cb7dd69f0a0838f48f7f5ba7d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 320, "e60caa642896321d46c7d83510940774b665100997d7661776efa86c44957123", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 288, "b89f879ff7ca931964dea247fe135ca0700b82763ba858e0291c8ff74ecd0abe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "e65e0f7c2f4d1ae72f39c7bcca37be98bd9a74952e1c8c498756aeba58eebdfa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "74a0b6b6aaba9bf647292a97f8929992d5f149ad802a7f7d735cab949691f370", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "9ed9f840dd6f38d4236b00ade9001097c82a541ec8f245ed5caf0e0c019c3af8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "3dddce9688a7b630f7d2668559665c0a5d0dc23c918c057e08957837b8eea741", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 227328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "58d4fdff0fa008cd47743dd5bf6b613e7420ca662795f78e4fbb247676f9852f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 227328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "6fcb9456d98d5235a653f0f10a3a75cca153151067dcbb8722c734442d2191d7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 227328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "8968bca2459dc4bb5d3263c3bc0cdbab11ac8ccc2ce80dc9b4878eea5479c6fc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 227328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "c0526a15f354825a2315882a60f5b9850fde234bc53bcb608f1149b6e479e9f9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "a1546623669d37e7520ca0604bc2c1cdc63936c0fe0af9e8ca49af4313dc134f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "8e48ead84e3276b67cff395ce43631485abe438c96c80932d5bcde4087e84c8e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "a729a9766d3054cbe48cd31c3530ef8d2ddb178188092a2609f3605b8e45a753", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "97a5c183bfc91b821131636d7fd554a72092b3da8cb78675ddd443781a548691", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "cb310a684ae4715fc2c56bf54d79cccaf6a094f6c94a7d4d58a037a3d7571a7d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "da9571de70399efbacf5b4798861b9032e0eecc45045f5d49e2d362cb84a6996", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "ec0740f26c52417cfe1ee8f0d8bee692ba8750eef5af04b269c7e5a020845701", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "60504e87bf2523c155a1d939cd8cf0da45799badeec9781a8e4582fb2c082c7d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "88d573c74e9adbcd82cb63decd9e893aa46ffe2196a1b2ba386a4fa66b6766a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "e60a2e771af440fbc8544712677809a0b8a72b594d0970cd04b9043a0d778ef0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_dynBatch_sm100a", 480, "65dae7057830270f60d3676572ee6c216779d56b0d308e68f5ff3d05d836d7bb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_dynBatch_sm100a", 448, "57e47ae56fc081713fcecfec653701dd9090b9687484e7ebc7171653ed5f6e25", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "81bd81bda167a9663653dbe331ed7b993a5e907b1fb74ef595772708dc881158", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "8f80c050f9ba6bf6526d80f4a33df7883483bd260681de2cc342884c4a9746ef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "56cf1510024095853b1e2deb6b8c510405f215a5f183c9e46e8e12d1af8d3d69", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "b20dbd20c1b257a9e614fc29a2c33a443df4386eb76b44e43da4fb3011fbcd29", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "6a2b99ed29ec21a7d6aa0b66fa51a4e8d6a84d81facd96a8444b2db4c574abbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "ffc90cb5819a2596d09b67fbec0b6ec59bbd7ae947d04bdc482146ca3b065f58", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "e9e1acc05ec4ce430a8659b69f4e9e41c2c8df4d5ee980be8a9740e164ed05f7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "90c3b2e5080980efac0eefb395b459a383742571ce5a68570ec5ba3116c593f6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "85964ec3e0c31e371e4de59599fcf431834d9b2a7444353a6176ddccc47e1e79", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "7bcb20fbd09409913683c258ce3ae4e9a4150499c8b6fcd43084a9fa2cdce342", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "bc8302758999935f2ae219385f2479e8d2c951cdecfb6fead212c025755b4d7e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "13647a006d8721c2fc54f95b9f156cdff32c0d315952db51460ca4110b119382", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "484bfdc3fa183d5d1cff9a202146386c749a049da7e4cd6af6bde393a050a50d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "d6db4da3c97559effc0a3d1b67b8fcd0b08bd01d2129c37cc1c8f0c6766f3a5e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "78ca4ce5b1526d7962684c46ea7ab365f30b2d5de323543d97fba2aba19adad3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "6cbf89cfc92f62d7230a4173f0e41189af9e01b2d16f56adf713ef6db82b7f63", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "88a5a45be66582a55182cd984cc60e5eb059ef29ed268744939796855c34cf1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "da31f1bd0de8f3a9bbde6496d3da3ac9ea50ba08b212f3a1f6b6ffe902ba859d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "be9aa956c8c22f5e2a4d52f376a4744ee48a17ed5bfe846cfda1dee5a5121f12", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "6d459fd2b42c07b0cbc6a5cd0da76dd5eb06c5c80eecedbc031ac7098cf6f16e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "b4dd33b5fd84ce39638eb91a33b10dc31663a499e195d08a5b9474aa79e2852e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "e063b549378f0715b9609e670565e20a7f3ead207920faba523c082f4455cbd6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "bc7bdafed4eedbda526cb7145b19f43acd6dc40c3dfec66a6d2b59d0755647a6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "4bf8cfcba43a70a69aabfce998a8fbfc377d76aba02e454e77d7a551524c41d8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "5de78539a5fd7202ec699420ccd5afe389158bcda1b06c409837f90fef3478de", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "db95e37999315d1abd62a01fc35382d9c8921ce9138ecf36c0cb7a0e72f44d6a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "a22b05126656c79b3126ac14357db1d3fef1c98d2e0d9e043564a1b2bc5813dd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "888697fb276a39df1ed4bb81a996af30ae56749970fb52f4750cee741d4bc80f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "549e116c9fc19c20d439706d9e078e351fb41ef029a1a3ab6124a8ceff43d71f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "dc875ee2691c465ad925722617b63ebaf118785acaf66cc2a38e2afed4898830", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "82979042910d40cd0e3c478b7c4da21749cff5555e915b91609027a768b5dc15", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "12da23ac2476e73def7e989263d9c29b88605be097d1a94de38d55773ad3b788", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, "af848612394f5745068a967fcad1b586b973008eb320f06828f88072e21c4fac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_dynBatch_sm100a", 448, "79033e637cd059c21967fe6b71b42af5ec957ddd128d393200650b5736b1fc15", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a_cubin_len, 149504, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_dynBatch_sm100a", 416, "d206145d255366f55d1dcab29fd25546d1c11013dee83892e4f057a1eee6597f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, "a0687e7bb666e508820e6db1186750485776bd56b78d8c599817037a3aa9bc59", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "86fcc35cb05914c983332d1dfbdebe3ade8a0ce676035ebfafa5ef51d6fd6019", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "f8db4ecdd878060c477268f840b077cccd0565d174289efda44cbc1f504b51e5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "2aaaee49fd39f1b3ae1e7f671f1a7a32ae861f62ebaaa165ef939075d9ebbcee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 216064, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_swiGlu_dynBatch_sm100a", 256, "085d5a3bb458501e819613afb2366c1ada7c06bef9af134667c5b63cee970e56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "111f0d568193eccfe6010c06ecb6fd5b876ca4dfea8fcdad73d11c0521350b62", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "5648ab96ca9b21a6b638300d2340eb65589b13227f2e16db59895a030b316cf9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "123302482d518a005445496be835ab3f80db330d2c5f3976be52a2920eeac22c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "c237b882569bff1ea0f03f0078bb1b3adb70c71103dada9594046596140c7d36", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "47d48e420728fe1bc9460db72352bde8fa25c8001cac629f4126bcf840633fe3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "cb8f26830e4fa5841e522483d1b56684bce536f195d2bbe72cf9f7a8cf60ddf9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "a35983fe93ad44ec32b6749877c14d94a575563c10c7c3b03020d3b4bd79decb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 222208, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "64aebe31d8fc378713e9440f028839346ebc8ee8e08340c36afd48386ba1723d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "7c219650d5fc87696db9f11fb0c015734fbea5932ce4455136c04f70d84f98d1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "c18c85a252faeb8e983673834330a44c382036ad4cca23f3736c5ab031f829dc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "717366f31fc3ada9fa722efa72fffdbf2057997ef74a6793e27f2c80b28cdc30", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 217088, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "895fbd772a17901628e51cbbfb374e6a08d9287369c1c09dfe4b8bfe9e3b19bf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "95da8de06142f5981a1a1aac58823e47d83506d841f65b6962390ef120d2f4dd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "dacc79d214f5203ad693101db0c8aad3656b9761cde5442e93f35ee408948e18", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "f608afe81516c5d9a51c083bdec0f7ba921a3f0b7b8b13f4fb5d8558011c7679", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "4674933283f116997db4c042ef1a9b5129367f239172be65c12d563049f37c89", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "7df598b10bcb46ddc201634bffcf4521c1d7e013aecc97a439d243f435e27f91", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "7700f22b4d741486d18a769f8ad2f3fb1406358f89d107df64db2b682a8e19a9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 352, "c05a470b05ab3195a14e04e7485abd7a11184e19ca53a37b7803437385d7d47a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 320, "b85ae8f57dbaafe24f22237b9bb6bce5acacd5ada9220609cfa10289b3d2ff96", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "99ab26b95e0a99fe1f31304d1fc23bdb822a35726f7745c2fc80664b54a4b38d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 166912, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 448, "9eaad6f96aaa131777e9314694b7528c1ce8e9cf1c3e245e5bbb7ae770f5e771", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, "0465a13ebbcb691087d65e792a70600380921f71fc8d59312cf13b5fa8eec6c5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a_cubin_len, 61440, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_sm100a", 384, "415efe0c47e6225bb3f48283eb6653a7a36bd1975279470c23c8e635509b26a2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "81495845035f9056c3decaa85dd29c948da52e5461873668f9a080abd771b2fc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a_cubin_len, 217088, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_sm100a", 224, "e851a23fae03c20e8a7ccf2a2aed3566fbdd34cd5b10930268251ea5047338ab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052679)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "3d1660793413dd8ed4571f1c2429bed3eff6ba31109fbf73d2913ad79f8d8d9d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "44d5efed73fa26b2b8f09855ed72a25e18f952efbebbc8f9d0b134adc19dd919", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "c2ab68b67173cf0ec955829f072a6c6ba81c87099d81a52bc894ad88a734f243", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 199680, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "0043967eb354cc3839b7d0fdc9580a8b81927f1608a8b8c4095ee99cd0e27c1b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "5c9e22989cb49522c7131dc6e2fcb4dd2c0b7e9948a264cc78197363e8ab7935", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "b6104f4c89ad44733423acc64d7f01d7e02e4641b097c919bd17bf6f2e0bdd87", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "b30d525ba398ccd9a71fd8deca038c8bc45c5e4788be7474cac4449db221f589", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 223232, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "7349edbb3a185af9bcfab74fd5917b5c779ea6cd88471a3829b74d776aef594f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "a31168deae55668f34ca82c962cbe0513fbcbb66a8ed43cd9f16ef96428ce9c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "90823320f0cc7e980c3609e1ab5f497485b4019ccb4dc28da4c1a033d9184039", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "8a30bcce3b4470d31791b30dcbf0f1a26b72e8311a426c42512051e019f9ecad", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 220160, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "2c7533b7ee128fab42cb91e036185351f837d1b6831a412e3912fcdfeb0fa907", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "58b743ccb3a6d34531be0a6610b8dd493d05883bdbf1ac72bac72bd06f35192e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "1b549664262b07172b3e9434143b3a5f0aefa73eeb77e9a142088687b5d36ece", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "807cf68972502047f33aa6f835cbceeedd9091857ecc813df596b9b5aca0de55", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 188416, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "883b62f66e3537ceca42568f6993c21e9c62c088a27f98258ce09dd4bc6cec71", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "8e311531e593b36615829cda559adce0e1ff9c35bee1ff27ebd394787572c2dc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "1376d19c975c9d3742d472227940db690e8414f83a5757ffb01c2d8c7c8c6065", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 512, "cf223484c6b0b667491e8cf55f32a8ef9fe1f21108ef4d94addd89d46e4d38bf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a_cubin_len, 224256, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_swiGlu_dynBatch_sm100a", 480, "5e6b12bb111e19c54723027bf1c5b3e72e8e8c0ae02aff3bc08af02981419ce3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(17827853)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
 }, gemm::SmVersion::Sm100a },
#endif // EXCLUDE_SM_100
};
// clang-format on
}  // namespace kernels
}  // namespace tensorrt_llm
}  // namespace batchedGemm
