/*
 * SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION &
 * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "BatchedGemmOptions.h"

namespace batchedGemm {

namespace tensorrt_llm {
namespace kernels {
// clang-format off

// Export version and commit identifier strings (presumably those of the generator that
// produced this kernel list -- confirm against the export tooling).
#define TLLM_GEN_EXPORT_VERSION "7.0.3.0.3.0"
#define TLLM_GEN_COMMIT "32110eb"

// Length of the batched-GEMM kernel list; the list itself is not part of this section.
constexpr static size_t tllmGenBatchedGemmListLen{408};

#ifndef EXCLUDE_SM_100
// `*_cubin` pointer symbols for the Bmm_Bfloat16_E2m1E2m1_Fp32_* kernel variants; every one
// starts out null in this header.
//
// Each line carries an initializer and is therefore a *definition*. Written as
// `extern T* x = nullptr;` it would still be a definition, so including this header from a
// second translation unit would produce duplicate-symbol link errors (and compilers warn about
// the extern-with-initializer form). Declaring the variables `inline` (C++17) keeps the same
// name, type, initial value and external linkage while letting the header be included anywhere.
// NOTE(review): this file looks generated (see the TLLM_GEN_* macros) -- mirror the change in
// the generator so it is not lost on the next export.
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
// `*_cubin` pointer symbols for the Bmm_Bfloat16_E4m3E4m3_Fp32_* kernel variants; every one
// starts out null in this header.
//
// Each line carries an initializer and is therefore a *definition*. Written as
// `extern T* x = nullptr;` it would still be a definition, so including this header from a
// second translation unit would produce duplicate-symbol link errors (and compilers warn about
// the extern-with-initializer form). Declaring the variables `inline` (C++17) keeps the same
// name, type, initial value and external linkage while letting the header be included anywhere.
// NOTE(review): this file looks generated (see the TLLM_GEN_* macros) -- mirror the change in
// the generator so it is not lost on the next export.
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
// `*_cubin` pointer symbols for the Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_*
// kernel variants; every one starts out null in this header.
//
// Each line carries an initializer and is therefore a *definition*. Written as
// `extern T* x = nullptr;` it would still be a definition, so including this header from a
// second translation unit would produce duplicate-symbol link errors (and compilers warn about
// the extern-with-initializer form). Declaring the variables `inline` (C++17) keeps the same
// name, type, initial value and external linkage while letting the header be included anywhere.
// NOTE(review): this file looks generated (see the TLLM_GEN_* macros) -- mirror the change in
// the generator so it is not lost on the next export.
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// `*_cubin` pointer symbols for the Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_* kernel variants;
// every one starts out null in this header.
//
// Each line carries an initializer and is therefore a *definition*. Written as
// `extern T* x = nullptr;` it would still be a definition, so including this header from a
// second translation unit would produce duplicate-symbol link errors (and compilers warn about
// the extern-with-initializer form). Declaring the variables `inline` (C++17) keeps the same
// name, type, initial value and external linkage while letting the header be included anywhere.
// NOTE(review): this file looks generated (see the TLLM_GEN_* macros) -- mirror the change in
// the generator so it is not lost on the next export.
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
// `Bmm_Bfloat16_MxE2m1MxE4m3_Fp32` `*_cubin` pointer symbols with `t128x16`, `t128x32` and `t128x64` in their names.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
// `Bmm_Bfloat16_MxE2m1MxE4m3_Fp32` `*_cubin` pointer symbols with `t128x8` in their names
// (both the `cga1x1x1` and the `cga1x1x2` / `splitK2` spellings).
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin = nullptr;
// `Bmm_E2m1_E2m1E2m1_Fp32` `*_cubin` pointer symbols.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// `Bmm_E4m3_E4m3E4m3_Fp32` `*_cubin` pointer symbols with `t128x16` in their names.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// `Bmm_E4m3_E4m3E4m3_Fp32` `*_cubin` pointer symbols with `t128x32` in their names.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// `Bmm_E4m3_E4m3E4m3_Fp32` `*_cubin` pointer symbols with `t128x64` in their names.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// `Bmm_E4m3_E4m3E4m3_Fp32` `*_cubin` pointer symbols with `t128x8` in their names.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// `Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32` `*_cubin` pointer symbols.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// `Bmm_Fp16_E2m1E2m1_Fp32` and `Bmm_Fp16_E4m3E4m3_Fp32` `*_cubin` pointer symbols.
// Every pointer is defined here and starts out as nullptr. They are C++17 `inline` variables rather than
// `extern` declarations with an initializer: the latter are plain definitions, so including this header
// from more than one translation unit would produce duplicate-symbol link errors.
inline unsigned char* Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
inline unsigned char* Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
extern unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// Cubin image pointers for the `Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256[u2]_*` kernel variants.
// Every pointer starts out null (presumably assigned once the cubin is loaded -- confirm against the loader).
// Defined as C++17 `inline` variables, like the `_cubin_len` variables in this header: an initialized
// `extern` is a definition, so including this header from several translation units would violate the ODR.
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// Cubin image pointers for the `Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256[u2]_*` kernel variants.
// Every pointer starts out null (presumably assigned once the cubin is loaded -- confirm against the loader).
// Defined as C++17 `inline` variables, like the `_cubin_len` variables in this header: an initialized
// `extern` is a definition, so including this header from several translation units would violate the ODR.
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// Cubin image pointers for the `Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256[u2]_*` kernel variants,
// including the `cga1x1x2 ... splitK2` flavours.
// Every pointer starts out null (presumably assigned once the cubin is loaded -- confirm against the loader).
// Defined as C++17 `inline` variables, like the `_cubin_len` variables in this header: an initialized
// `extern` is a definition, so including this header from several translation units would violate the ODR.
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
// Cubin image pointers for the `Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512[u2]_*` kernel variants,
// including the `cga1x1x2 ... splitK2` flavours.
// Every pointer starts out null (presumably assigned once the cubin is loaded -- confirm against the loader).
// Defined as C++17 `inline` variables, like the `_cubin_len` variables in this header: an initialized
// `extern` is a definition, so including this header from several translation units would violate the ODR.
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
inline unsigned char* Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin = nullptr;
#endif // EXCLUDE_SM_100

#ifndef EXCLUDE_SM_100
// `_cubin_len` counterparts for the `Bmm_Bfloat16_E2m1E2m1_Fp32_*` symbols.
// All of them are C++17 inline variables of type unsigned int that start out as 0.
// NOTE(review): presumably set to the real cubin size by code outside this header -- confirm.
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
// `_cubin_len` counterparts for the `Bmm_Bfloat16_E4m3E4m3_Fp32_*` symbols.
// All of them are C++17 inline variables of type unsigned int that start out as 0.
// NOTE(review): presumably set to the real cubin size by code outside this header -- confirm.
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
// `_cubin_len` counterparts for the `Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_*` symbols.
// All of them are C++17 inline variables of type unsigned int that start out as 0.
// NOTE(review): presumably set to the real cubin size by code outside this header -- confirm.
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` counterparts for the `Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_*` symbols.
// All of them are C++17 inline variables of type unsigned int that start out as 0.
// NOTE(review): presumably set to the real cubin size by code outside this header -- confirm.
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` counterparts for the `Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_*` symbols.
// All of them are C++17 inline variables of type unsigned int that start out as 0.
// NOTE(review): presumably set to the real cubin size by code outside this header -- confirm.
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len = 0;
// `_cubin_len` entries for the Bmm_E2m1_E2m1E2m1_Fp32 kernel names.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` entries for the Bmm_E4m3_E4m3E4m3_Fp32 kernel names containing `t128x16x`.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` entries for the Bmm_E4m3_E4m3E4m3_Fp32 kernel names containing `t128x32x`.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` entries for the Bmm_E4m3_E4m3E4m3_Fp32 kernel names containing `t128x64x`.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x64x512u2_s2_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` entries for the Bmm_E4m3_E4m3E4m3_Fp32 kernel names containing `t128x8x`.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` entries for the Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32 kernel names.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
// `_cubin_len` entries for the Bmm_Fp16_E2m1E2m1_Fp32 and Bmm_Fp16_E4m3E4m3_Fp32 kernel names.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
inline unsigned int Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len{0U};
// `_cubin_len` entries for the Bmm_MxE4m3_MxE2m1MxE4m3_Fp32 kernel names that follow.
// Each one is a mutable C++17 inline variable that starts out as zero.
// NOTE(review): presumably the byte size of the like-named `_cubin` image — confirm against the generator.
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len{0U};
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
inline unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len = 0;
#endif // EXCLUDE_SM_100


static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = {
#ifndef EXCLUDE_SM_100
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "3f50040873bad87a6494c9cfe1fdb74cf105a262da64d6eeffb7cde948e4c8b0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "421c9180d5ef84c582c13ca6f758e3eb88db2c0221d6d5358481994c793fff9e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "9a5af1b5f22f8cfad3585bc94c7ab9e3889608184af7a90fa72355ec3580e37b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 136192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s6_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "fbb8bd03a8e4ccf046a6d26ba70538b6c9295f38d9558fe74c472dc6f9500407", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "1211763de22658770e6fe0eeeaf075b6c3b9fc404184e27919e920c9cd89ae6d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "80f36089bf16902e0cfe5530fdf73181181e42bbe1148a06f528c9fcfe951c1f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "0382740f8904e76ac8ba65bd139c0703098be937ca6a48f8d805828d2e9fb4d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 178176, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "e9ca5eae59b91497e401da818bda7a66c07c5c03b9b12870b8b46b1ead340290", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "a18ea12a8da554dd67a5cade11d3e2162bb2cca0349a3858d2b18de571423898", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "c8691567af794c5de14d5a5cc4f2a5d28536ca6073422b4df47c9afbb8f2be06", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "28d8f178f4347d48c162b4a35786bdab9378c9dc471a02c80b3aed6650a4e6a5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 154624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s6_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "b8269703cdd5409fa0bad9f8ed695dca41ff739af1ef94b346c3f0be0c404760", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "f30f8d237cd1cf14e5ec6e46384e15fb0f29b49ac06d115760f7cd5de96139d3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "9bf99980b7a6c15a6ae98733aeec0db642207ff052c875884147723f39b6413b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "78318af7a0405b458217987427d2fe31e668b1e5316c3d14d72ec23b71117aaa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 200704, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "c9df13716958e4441c1d29ac5c1fa6cb2d293389fd8f779bc03fd2abfe62423a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "0a9146fb80f1248e9b89895999cb31834c3994ea3b490ad9a8a444386742d16f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "bd37ba431b1765a9ddf90d5f46549ef5bd453a20037cdd497baa79be476b781b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "f6435ef5c7d9d09aed443595c849477def79f95a9995dc0a938aed6ff6591a16", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "47277e74746b699eb163ed30cb063fec743743f969b881247a8723296927a452", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "4226a60504e7275e4b4a42f764e98294258e66f8cc4a10e14e171df7b2f5066b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "bf4fafd9e3cb30a9b0446dd8838849db06a959f16689df21dfad17513682bc63", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "86f8fffb1cea50303e40100d3bc51e0be57b3a70adc2a1859a8694f79049349d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 190464, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s3_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "1e121aa979c8bd4768d793bdb0003889e7be3f4a31ff2624b285fe4c92e7cb87", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "7a0b234601ff19b9a74f9a5e049cfa9bbb2412a588027cabb73ac41726388a05", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "273bd4ae9a7fcd4dc3b4beacae4660a8aeafe93ec01cf13c99ddda7cde90c6f6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "235c7cecc53e4b5b0fca2a72f8e9a467e9fee921739087acf3a27cdebc688cc2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 126976, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s6_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "f14ef8c6fb913f5fe433f4f7cef57d22453d9827e152b7998154f6701193d4c7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "37779c1a8bc4082b5c10f813cec67884a076d704450abb72006730c9d643da3c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "82c9d523368e936cecb66b5964259c128095eefddc099fa5679c963d70ae1a86", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "132269771417a940aa39f43c0dbf496c585ad385b38dc182f28274c97307aca4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 480, "f564fad6aed8f446292c1019b7ec6fd8700afefe5ae9db03f9338e153fc8685f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 448, "298d1a1794a17c5d0585684e0b33711daaa2f6e0b00c8f47dc3f9e08f68c79f7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 166912, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "7bb43ecdf9e503d1f0b510b6161ca4c7c98390fb36255fa2c4c094ed474cf381", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "033f3c0b194580bd4a8e0d32296ec950d3336a90f22c1d2c19601b3f04706025", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "a4853e328a5ecc75c50ffa766da7f3dfc2474bab0d31e5b17fca6323b6dd06b9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "10f1d796023187047bba9a36b6b4be2cf3625804b39e9119ba60c8f9ebafea56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "e90f00f9cc3a99474284d1d40300be041cc4616412dedfbf3af6fd8991fee6aa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "b9ff246be6e1494a34a8fd737cad55ce0d7cd492f0ad0981c11e7d8e56a1b6fe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "4c7c2cfaf1a4ff72cbcc6da48792ebf84c4a01cb17e076bf5bc71a4ca3863853", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "23af50bafd4ff3219b3b2e77185babd14dc564196e59d4a7ce0c71f9f63f6e1d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "63ec7f175f16f2148f9112466e7759c6d080de0f21c0657ad4fe32903ed220e8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "0227acf26857469604694c037b2ec247e95f1d363b502579227bddd36474d22f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "7649909f8952a983e3573f19be8d0eeef90ae853f3730cae014dd97bfe3ca225", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "0b0535fe5e1dc64350dc3b525ae1cfa4edf7c8e5ce140431c474786041e74f53", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "f40181a72c1a3a9de321ea65d10398214ee7f7de27a8e74e67b6d32e58194807", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "87fe035315d329d83ab5dcd4201a173a43190ee7ee2949a0836309afad140734", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "c92032ee7452c8f79ac9526fb9db84b0d50ab18da19011dfdcba6e6b3d852397", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "033db21cba9333e6789784c5d851b60e431e5d421753bd2713cf3da0063fc283", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 121856, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "3be81e7035ee5a5d3488f9501197fea2caf7159de69217eab8d041dd013f5f7c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "97266c0137ed61bca912f69fb299aa6369fbb4bc53ce3883031308e93c8923df", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "bf85855e9e7a943c409e87a393fab52c09d8fea4e3d49e750f8b463576555d23", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "e749eb37196ae08cf93ea4f2a7e65b085c2cad49d109102ee211748953e57b01", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "4cd14db05dde5b92cfff810910d8fca58b1607850cba9a6fcaac41dc204398e5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "834628f9c031e0e74b5634a732bb1af96039f7736fb2c39c29efe97a60435d80", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "c6f35190ec003d65f903d0d017577377ffe50a4c9ba8ef7398232e514e775bcd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "7400ee66aa0eaa3218c7e43e06c3b449bd9d263fee808a62e3019cd67cf71c78", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "401fd2bfcd51e41ed9abd426842f6d9a8e59ef3b85fff0b4e43a15a59dce992a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "dc8fa6335b2667f779f474cbb5cfe8931fdc8b6ea6b54bce03d15f34a555b71c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "e2e6c0e37a9bc30aedd1219d2e907f25d70ed2d0c507e82b4eac4ad9fcbe354d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "c7fd0a259771c1c208fb20b3f7a0bb9859a47b6a655e47a4b03ad8a4567f32ee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "5c3392364b2d53496a1505a6bcb3b179ee718a91aba166d1352a89e6f29447e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "8cc703d5b28e8bb936a4ca647f749aba2b49a41cbc8c33dcc874b54b074a0f8b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "bc9c55c5b547cac8ecceb6a48b3efa2f53c58fd7b1ffb8f8cb0a232c40af29af", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "14d6b39d1ca45a50381c359b6c5cbc432e5b61f4a3e5aa90ed1769eb5d8c394c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 97280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "fa0384182891b60447683a86f986a9d1193388760b3cd27c8440781bccb513ce", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "d687e41aad68fcc7fcb695bf62dcac712024fa879e7fd370cbc55370e542e940", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "eb7cf8dfe1dc909900275dc5ff5693591e5546eaa195941336ee4207dc54aa67", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "a14678fe5f84b7b8541398c5bc5fdb732cadc8d974dd985e6a907cf195c64813", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "12946559dc5d971ad4ded3d1faa415eec7b395ff8b3e55b5f0da8b372de0dd18", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "648ef4107c15ad51db68a834255b4fc43e099b684f2ec5df19d7ba7473e26eae", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "9a90dbaa0accef76e6b3cce8c8a5ad4339bed854a69bfd186519652af3cce24b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "8da6a223f8df73432b0601dc953b4ecd3542a2e499ba1ccd36a9dabb8d5774f2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "5dbb3a228c68dfc4cef3a5c987306fa6e387b801f133d57b897bce0626fb07cc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "ec38064002c223d2f08fa823db4f902e671f5c1b7029e70ff786a0a08c129341", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "dceea69d8a13feddcb5a63e3300afd3760bbd4186de64294f0452243fae2a8b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "7cc2536574de984495b44de0ef8f6c45f6381f9abdaab01f0cc367c0a8661a69", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "0a74ae88978f2aad22d47816aeb36e1229b398cddb63d2a1c00266e33f98ebb3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "dd18f671d972032a78dd12641870f989cb3b8df3176fd867331224f492d74227", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "597374f4d8c6227a98a134bb20ff6e45491f6f994d44633b4d8a339e8c50cdb9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "cb8335a75a566e201ce08b7bd12a270adaa71f0c0a7ced206684fb8c544f891b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 123904, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "c2ad2912f7315a2abfddc2e8a2f007306ff689da1475a60b7b8e219a97dd2fbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "027475aef4ed9229b874e3b01dfd136f909f1c1d0bb3fc382e283f944eb7fe06", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "98860c3803b347f64ce0a07d13e1e713630b45a01c705a70eee3611ba0e650b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "4119c179a085bacf3f07d57cd888269cb12b8cb0e8ea12d50d1add78ffdfbb4c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "6475afd431e333182fd56f19060a99c16dd676c15115a99b877ca582737e269e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "d27479f40eb777039050cf3452590e2a0c1fd35437b1535fba4861b89874e677", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "28c787a8fde1474cb1c432c3299d42896b1660cc987b1696671819ed2ce96b2b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "47dde169efefa34d2faba0364d087d1fdf986134d9e1de0764aa733354336d2d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "e53508cca19d5b67ad3cc789f344377bb8896494c05bfcd8bb90c01198b71899", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "5fb79245cf1df99a0cf864e665c2098da795adbd377e2e4bdd24756e30661744", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "69abbc9a29f37e190aed3fcc1c7a2cb4f2c0ced73b42a54e7f3b8d84d0ec508a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "3ce175c6dfbe2ed1296a15a5573169ef8f3630e81191fa8bd1183784a6907ada", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "43bfe6367d451c1641749b52244ae40ed9bc084d9ce825e78705e42cee76f7c9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "d7c94aa9865646a2d3d7b36b7daa9b06b6145688924532f6f1fcd651ea1e448e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "27f5bed11c1be2270461d7a16bd04c2d5b0c3d1da365926a02b694749f047fbf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_sm100a", 384, "ef5d746adc2a09e3294b52ce6684d33f8cef0fc6734fe70e38efdfc105642e5c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a_cubin_len, 61440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s3_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_sm100a", 384, "10a957da353c29e2cd57852d7dcd90d3fea0980ce130d3ef3d5ac9f6d27f742c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "f8c169eddeb2274d509a10fcfed8754a2b9657267d56e804da0dcc5f72ba54a2", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "6fe46321db31e54185cbc03e20e154fb7895c0ad1888365b9d51cedd63d5764c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "3ff9f922cf74e03cb5b3f03172d4be34ceadf911ff3b9ce6d6b646ace5eb9a77", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "b4c54f4f30f019d312346106faa8384065f7e99e9803e46343e7838a1a818664", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "790b2b58ace3c7a9365e8f37f34de8e2b6dabb5c7742cdb33c06d16484383ae9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "a916d59461e1c44f8897a40d017f42b68f1fa86e93679517fef80c1bc49f043d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_clmp_dynBatch_sm100a", 416, "e40531b337de3dff738fd228d732675485ba0e32a3a0f4cfba34f106edce258a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 149504, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_clmp_dynBatch_sm100a", 384, "53d992332e02a59947bd40929f9c8efd3da79234a77d6fddabd868d58bdff5d5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 8
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 256, "b9f33b3ab29d19f2db6159a55ac761b1e4b128b06f6d7108881aeff8ae59dfff", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 224, "1db2412318ecc631ab85bd07feaf9564167ccfa09f89d4aa61110f221fb6e279", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "9c7d64b40f4417659ebc3447b30a198f2eeff2bad4e30453f0dc3a00656d33db", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_bN_clmp_dynBatch_sm100a", 256, "6f2017653064416fc97c0dda5e5070d3d72e07e2a7c423f7ff085162898f08ed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_dynBatch_sm100a", 224, "7313e5403903ff8374ebe03bc3069a248caeab61f28ae356afcde042a4b13996", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 217088, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 224, "16180b696decf4306a25573ae19dad86217014aecd96aaa44de1b513f3acb422", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "52b258085b7e0970dc5268ec7ca7ca3a5c4980b94e2b63991fae61dac9f2a763", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "fe0e4a968b4472f8856c2198f59dfb3fe778e733b166fc0fcc042d822964ceac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "3b112df01a3b928b1b1bb19f278011a18e9b866715cd19887ccbe200dac218d9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "d4aa941092162be422042c26e6868843f229c8d3f374a6760d0627922d0105c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "f3edac4357dc88e43c54167c4314f321943c0e63eea05c8234ee121074b57e3f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "fa31b14ddda97d48a48d09d14d3c9ce86fdf9285344875bb3a8ef7a717c8bc9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "dfeda97aa906fed9d8564bf3c1ddfbde59633837fb730f60006fb593ac60d51b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 88064, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "cc440cac9aacdd9008b8e8b1ab1040459eda7702fdbd2f895bb4bc926bd2fc4f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "06048dcfaaa81ff67c1f72634fdfbbafb81092dfef6bbb28c7b75e1d57b4ff2a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "b85032d1fc6b97f44cd9d3778cba54b25bdcf1cedd90efe34d15f8ecf6eb691e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "9a52732e9b7fc056e74775739c71b9659f9561386dcfb7ff18a13f634ca59ced", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "448185b1fd23f3315d5b1fd2ef833066d53184700117b86fa3b6bfa33a820c5c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "84ea4dd3bb574e7668780351d3c33c7846921eb7fc2a00794a7cbc91c96f7ba4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "4cde38e9d6178a2da1028c81bb7897fd0ed5be177cd949ce7f8887ef1f0bd811", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "79bbbc653e8be33b7e8b2ca2400a7e2a9eb94752912f18fc291a0d3b6407e979", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 116736, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "5c1d3679cb47511def1d4db96988ba8104914592a46b4ad3df554e2ffa482577", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "5c579bd1d00f0f0ab698aa5cfd70596283f367c962081c707b7b8f101ab84793", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "d2800340cfeda9ccccbcc4951b64233263b58bdc12e79896f61ea26cb41edbab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "469c956f7e2e095908ddd733b1db48d1effbff506bca5efc43f7395d72c1c48f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "13efa9ae5dd6d8e88d98c2f5af777eef65a538a3530cccfa08054c6e26714c93", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "a6e68338e9fe01626be6c254deda824bf588a69be035e195254de082ca68c723", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "28fae92ddb56b8569597955cef7b840c995d6d8c375bd0b6404086dfe722b025", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "c4a5d32e52bc38ebc69fd0102bd1c4f4444f114345b24674832acae0e6b34a11", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "9763f6a7f20c8fa74157036a62a755dc6569ebee9cf054d1c088182633bb234e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "3a28fe8644a7bc7a1cc014a8897ab0fe2166436e2cd5cfaed18d8fb57ad69413", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "fae33ca8316876b52ad4442738631714bb966dce0092b74026b4b1186ec074b3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "84acc24ca690945c29621f601d016a8dd6a0d1469b8d735806fc726b41bd1793", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "b00a33a971a83513b638849fb432d0fbe11eb4b38ae4a696227295964b939aa5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 416, "9e7147c1111d5312284b3711b5cb5e5615212529b87d67cf319d923276dc7d31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 448, "15abd2a84edab7a22acf98b74499a0789fd767a68a813e147f6ecd3372f0f851", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 384, "b386b8c7e1c9339659e6837240fa2f21f8fe927dd5374fce061d57517c9fcf1e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 73728, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_patch_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 416, "ef2e3792a633ceba03139e91beb01517ee09d2b7539f67e146be12104e284280", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1052672)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1052672)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 16
, /* mMmaKind */ trtllm::gen::MmaKind(1)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 1
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "38b5f0e760783280d227c998b300f0e8dd71a3f26373714a22e2f31348a3e5ed", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "a3a12a8fad98acfb1dee2a7c4d869a39890ffce9d5db5966da9a82a46da2ed62", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "6b3293be7d956ba7b71371bea3abaaedd83b8f579db9a059c6fc99693223c4cd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 201728, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "22b5ab9b379205e9fc715a447023dbd202303b0f8d491ed3bffc77c33be2587c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "ef6fd532b1f40ca268e0ae2f2dd2c862ebf43e67bf57058d889346a0e25f1725", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "33a419aa9f7d3efe0f05b76483b1f745471a65cf59cfb76946d3abca189152af", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "169635b45a94a5ac3d7abc68ac2d3de3cd5438afab71b672b4f2eec22f16cdd0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 226304, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "839da645ae7ddebfa72c9a73ee25bc0af44c233f92744a74ae2b3a0853caceb6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "0994bfa66ea8a078b03947f88850366fca52db246742509924bf7a6f213c7bfb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "ec379b195e72575bf96f9ff16ea0c35eb91e9b291a5a3719751407d6daed6337", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "5b5d1151de0658df84e2810b3f1916d297be4ce3651d95d9a9de889e4afd1c86", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/64/256, mNumStages = 4, mNumStagesMma = 1,
// mUseUnrollLoop2xForMma = 1, mTileScheduler = gemm::TileScheduler(0).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (225280, 288 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "3b0858a36167ad32d9905500e2aa59743b6d91bb2c2373a70f9c3814aa074795", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/256, mNumStages = 5, mNumStagesMma = 2,
// mUseUnrollLoop2xForMma = 0, mTileScheduler = gemm::TileScheduler(1).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (189440, 320 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "91874a966f6eb543323f8467c2e3f66600f20f0ab8754026abde6b510c699f98", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/256, mNumStages = 5, mNumStagesMma = 1,
// mUseUnrollLoop2xForMma = 0, mTileScheduler = gemm::TileScheduler(0).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (189440, 288 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "955ae7dd9ea62cf762ca07c8ee875e24303b0d95b46b03768c07ad3631e5b143", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/256, mNumStages = 5, mNumStagesMma = 2,
// mUseUnrollLoop2xForMma = 1, mTileScheduler = gemm::TileScheduler(1).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (189440, 320 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "e11a6189ded24497219f2bf335f60a83fb17e163dc32459c7d166a015e2720be", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/256, mNumStages = 5, mNumStagesMma = 1,
// mUseUnrollLoop2xForMma = 1, mTileScheduler = gemm::TileScheduler(0).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (189440, 288 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "be1acac35d67130541d30ead271c206f7ac6f70ac4a0e46db4c1845053a8c3a1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/512, mNumStages = 3, mNumStagesMma = 2,
// mUseUnrollLoop2xForMma = 0, mTileScheduler = gemm::TileScheduler(1).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (225280, 320 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "0033ff8a409809884ee01fd3837975710d5cf654422b8055892c126cfe11419d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/512, mNumStages = 3, mNumStagesMma = 1,
// mUseUnrollLoop2xForMma = 0, mTileScheduler = gemm::TileScheduler(0).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (225280, 288 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "7ad06f231f869ea9e835d3d1083e027d7ecca6f7a5fb03752f9d351f7ae2fc45", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/512, mNumStages = 3, mNumStagesMma = 2,
// mUseUnrollLoop2xForMma = 1, mTileScheduler = gemm::TileScheduler(1).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (225280, 320 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 320, "620b6d07cfbcf2cfbbc16d4e4dedaba7eccdac2f878dda447431a34594cb95d4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/8/512, mNumStages = 3, mNumStagesMma = 1,
// mUseUnrollLoop2xForMma = 1, mTileScheduler = gemm::TileScheduler(0).
// mDtypeB (1050629) differs from mDtypeMmaB (17827853) in this entry.
// NOTE(review): the unlabelled positional values (225280, 288 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 288, "4e8fea6373057cb4cee2f2a32a4d0edee7817c060b3449471b0001bc9dd2d482", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/16/256, mNumStages = 3, mNumStagesMma = 2,
// mUseUnrollLoop2xForMma = 0, mTileScheduler = gemm::TileScheduler(1).
// mDtypeB and mDtypeMmaB carry the same value (17827853) in this entry.
// NOTE(review): the unlabelled positional values (125952, 480 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "0b7ed55068401563c1d4f1b8d0decb49821a0aaa2151c02ce4feca7ca038329b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// One element of the enclosing initializer list, tagged gemm::SmVersion::Sm100a, for the cubin
// symbol named at the start of the next line.
// Options set below: tile M/N/K = 128/16/256, mNumStages = 3, mNumStagesMma = 1,
// mUseUnrollLoop2xForMma = 0, mTileScheduler = gemm::TileScheduler(0).
// mDtypeB and mDtypeMmaB carry the same value (17827853) in this entry.
// NOTE(review): the unlabelled positional values (125952, 448 and the hex string) are defined by
// the entry struct declared elsewhere - confirm their meaning there before editing.
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "198533be3cee3890a08a67e379a1223f4daec31058c520726cdd29bdaf13560a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "f1fd3a527c984a2775575ada2a9f8c3dc5b1a67a16e2f8c33b7a5536cdc3a00e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "d54f4f208b17054371d86be133399e8eca22070d2698208cb8706b537baa67a4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "be7f908008125834306e4050d77214cb231f8f405e7eee09d01defa33a3cf2f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 125952, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "9a71d9dabf06f8c485511ffbf2596ba8c56d05acc22f82fc16b3f53dc35d74a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "ab7b410bd93cefea6d39fb831410c13fa2b9f72d772b7055f9a504f4e0b9cef9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 163840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s4_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "bb56c004e9600811b1287fd79ee6b2d5ba1a6111bbebb3a01590b4a52042dcb7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "3dd7d632ae2bd741bd66105cca2ff630e928956a57cb269180ace28ed0a2bb9a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "675e511eb3d5efc57b05609cfa044a3228077d83033845d272790997b613346f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "13f1e99d4a76143862bac3e5f420a10587bd6be5286615de4aad9b2ead3948c7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "9bbfd28c75e78ae54f4d2b58aff19bea505e96dc397d3dfc5031c815601826ea", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "712986701528a94354455e39c9759c5573c463a8a47df822c2286a65fe0daefb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 142336, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s3_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "ea38d7b3c60b20595e79c692903917e5cee01bfc2ac9e35af00ca03286e62956", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "a1c9af9aa0f57eac713088b8b31f5b216a145c503cdf7944880bebdde359dbe8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 185344, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s4_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "aef9a91ee0738e331f8d7c3bb3faa862113a247b9754ba6e12ac9766d452eb27", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "64f070388a1a75610038a835e814097a80e468aa6386be16006bc24289a245e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "1b6974f8108e1b0cdc6e3b066b84e0ec4cd580cfd3267ac1ec941bda41d954c8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "553f0aa838c31e24f41ac380d797de027c41358352544642cf585919e69b22dc", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "5336f9f353d3b5d2a204a95b22ea52c774bd6362ea2659a53a401062512cea56", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "587c54db90f77353dc7eaaf0130bc816e1acdc27ff84207b7ae45af5f203e63f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 177152, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s3_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "79c219a8c1326757393006cdaf72de6389a5aa9a3dda955141b54e4f61e832b4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e2e741c1d17440c4463d8f941baa4dbd9d63ac2ef5f5183df5ad1bf25cf06491", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 228352, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "3cc64b02c875b4660ee70fa2db4029bee0d7c60eb95446852a6732150133b6e1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "1fa7ae2040138a8e919c787e62ff4d5e235ac397ff75c78c56ca1eed1ea2c113", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "b98f704b2003d04bb36e2a9307a7319e6471e7d4ad7afa2080b657c34ee6d2c8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 159744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "037db60f5ab7b61a4e80ffaaa61f36fc32dba72bacf09334b15e1da592a53332", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 151552, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "bdb9e4f703807f6ce8abc9ae7bc8e4b8bc6f1dc4d026a35d482d0ba0ea2e85aa", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e89cf7342fc0013e957c2f58ddb53f84d05d679462c152378ad047134817a689", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "27c64ecb23ece3132d8fa547e9d577d20042063f526bcc73d16a10e9e53481e9", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 195584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "11d7f6c4c4a8d7f382fed2b2dc7f608743064587b19ae1b96d412d9db106a290", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 187392, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "3df6963d871d8fc2915f6e4be2d899ead0f06d4fb3156487ea39e2febda7bff7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "b419076e6f8be20e164ea1ed8d4770b9aba41a20baeb35ba8e0ff72f9c88f5f7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "2bef689b3d1f86873e618973d7aef905630574fb54b043dfd3bd1dd6d14d6f23", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e425bf76a44f9c244a8b04add431d3dec324e745c3ab58df1890d42c335f8e80", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "ceec79d213aca4da2dd3a230b3f81fa5d8a39890fa81befae3421de09d5316f8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "58962324203c255d56409d2fd7bbbe91872efd46168e157a22ac1ac70b7b42b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 153600, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "ab07787ab1420a630f9cd816a0151aed9b5b3f6270429b42e9255c227676e6b3", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 159744, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "c25077e7fc632546ab3389dace009de63e2972d38b9e77e7b980d010b15822c6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 151552, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s4_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "534eb114f2c8c1b77de1f33bab90959259d163280e912429b945302d0e06cf28", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "1862ac649a69e4b532651012a2dff768d45cbca4cf467ccc4041910eb1de1176", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 189440, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "233baae772c1b23100c09b46a79cedb27f13605556f0bc3e60a8839dde6a39e0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 195584, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "c21ac81f0799a8365f557430e12ce95f448cdb21eab8fe146533078b644c2d3f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 187392, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "eb55eb91b197919dcff2a184c1037c07363cf1b5a4c68d33ff53d738266af975", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 5
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "df7d18206ee418bfc9a636f5c326ea35a6c687fb519c3ed4e8ad1fd4b7e224f4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "6377876adb967003b602d3ad5e786c703b1ff72331929d490c28eb0f7c863bc6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "615bd354244e6679359592eb69ee4bb1fac02720ea6de3d254f5ef4234a5a208", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "a41e0feccf2584546ec060c487d0bc2f0a5b98ae348cf9c593b51c98ec6fd6d5", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "e464cee5683c77a9b0a8710a79df18c81c30bd1e1ad70cc1110f508283d74a70", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "1f1b53b8475b8915cb927f17cb914c8f39d3bea4b19cc2dc08a9cb18b13376c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "b6fedcccdb8c1f4a716e93165cd6dbf826b06889abcd2b5548192dccdc5c2464", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "6f3ad785800894aaca7d8ff8aa211b31800d2e21975bd72751a81f0687e3f2a7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "7399ce8bb98825c9f4dc063932e7bde7d4d32aebff3023d53fca90155fb42fef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x8x512, MMA MxNxK = 128x8x32, mNumStages = 3, mNumStagesMma = 1,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 1,
//   TileScheduler(0), BiasType(1), RouteImpl(0), mFusedAct = 0, target Sm100a.
// The t128x8x512 / s3 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 225280 and 448 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 225280, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "cd7dd69304a1f7208ba22a4dc2630741bf84a4ec67a664abdb7528a30bce4511", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x8x512, MMA MxNxK = 128x8x32, mNumStages = 3, mNumStagesMma = 2,
//   cluster = 1x1x2, mNumSlicesForSplitK = 2 with SplitK(2), mUseUnrollLoop2xForMma = 1,
//   TileScheduler(1), BiasType(1), RouteImpl(0), mFusedAct = 0, target Sm100a.
// The t128x8x512 / s3 / cga1x1x2 / splitK2 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 231424 and 480 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 231424, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedP_biasM_bN_clmp_dynBatch_sm100a", 480, "3dc4e382e28111d76580bd38dc1af4da201b2dea9f0fd5bb1cc0adf35b9d8d81", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x8x512, MMA MxNxK = 128x8x32, mNumStages = 3, mNumStagesMma = 1,
//   cluster = 1x1x2, mNumSlicesForSplitK = 2 with SplitK(2), mUseUnrollLoop2xForMma = 1,
//   TileScheduler(0), BiasType(1), RouteImpl(0), mFusedAct = 0, target Sm100a.
// The t128x8x512 / s3 / cga1x1x2 / splitK2 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 223232 and 448 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a_cubin_len, 223232, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_splitK2_TN_transOut_schedS_biasM_bN_clmp_dynBatch_sm100a", 448, "9048f79c720a2ce32103f677813c48ff741b475431290107f693960b4144637b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(1)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 2
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826828)
, /* mDtypeB */ trtllm::gen::Dtype(17827853)
, /* mDtypeC */ trtllm::gen::Dtype(1052672)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826828)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17827853)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(5)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 2
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(2)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 0
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x16x512, MMA MxNxK = 128x16x64, mNumStages = 4, mNumStagesMma = 2,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 0,
//   TileScheduler(1), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x16x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 175104 and 512 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "da31f6d9c67d08e85b7072d5a43266d0d6ee30735817b37f982c0477de645a06", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x16x512, MMA MxNxK = 128x16x64, mNumStages = 4, mNumStagesMma = 1,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 0,
//   TileScheduler(0), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x16x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 175104 and 480 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "42e9065be76998f1a64683085d9ee72071e85c3f6051fc9aec97b8cb4a8bc7cb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x16x512, MMA MxNxK = 128x16x64, mNumStages = 4, mNumStagesMma = 2,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 1,
//   TileScheduler(1), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x16x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 175104 and 512 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "46c62f5dc36d3a412416fb040ad22f3bbd4d65f5706fd2f32672bb629dc6dd18", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x16x512, MMA MxNxK = 128x16x64, mNumStages = 4, mNumStagesMma = 1,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 1,
//   TileScheduler(0), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x16x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 175104 and 480 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 175104, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s4_et128x16_m128x16x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "8e5a000b9df97eed3710350f1126dc698bb7f2cac3ca94e063b469769a176366", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x32x512, MMA MxNxK = 128x32x64, mNumStages = 4, mNumStagesMma = 2,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 0,
//   TileScheduler(1), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x32x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 194560 and 512 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "5d621271aba16b84dfe6283b5d93ac500800b567bff705d8a73d2d2a963e67bd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x32x512, MMA MxNxK = 128x32x64, mNumStages = 4, mNumStagesMma = 1,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 0,
//   TileScheduler(0), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x32x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 194560 and 480 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "17abea2a2e8181c85657a1c3654da4d5783bb186fc0547237b6cb88dff148d31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x32x512, MMA MxNxK = 128x32x64, mNumStages = 4, mNumStagesMma = 2,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 1,
//   TileScheduler(1), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x32x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 194560 and 512 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "ef8e4838f68f4020afda48c29d032ea4f56239df6b101d1b8f6c3a15f85d744c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
// Kernel table entry (generated). Summary of the values visible in this initializer:
//   tile MxNxK = 128x32x512, MMA MxNxK = 128x32x64, mNumStages = 4, mNumStagesMma = 1,
//   cluster = 1x1x1, mNumSlicesForSplitK = 1 with SplitK(0), mUseUnrollLoop2xForMma = 1,
//   TileScheduler(0), BiasType(0), RouteImpl(1), mGridWaitForPrimaryRouting = 1, mFusedAct = 1,
//   target Sm100a.
// The t128x32x512 / s4 / cga1x1x1 tokens in the kernel name string agree with these fields.
// NOTE(review): the positional literals 194560 and 480 are presumably the shared-memory size in
// bytes and the threads per block, and the 64-hex-digit string a hash of the cubin - confirm
// against the entry struct declaration, which is not visible in this part of the file.
{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 194560, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s4_et128x32_m128x32x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "f306d4c0b809848b99d22554426480770767cef45047dec3e80f8b161f2c140d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "39dc8af745963ed8342183716ed83d62266c28ddbe2290a00b1401de33537e42", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "932c455add6e5927757c2a56c4adfc81ad2dc114bd0c55924950dfc424432fd8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "7492c306444a92b35ff1666b496b0b518b1e5e1a3bf571334d4e29ab6129666f", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 178176, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m128x64x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "c068236aac9e7192e3c904b839834bdda81a1f8b07d2f69dbe032d0d05374822", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 256
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "e470d3aabe53bc2484d130ce968c0ebb8776d1a2c563377c0e6a128ec42efe2b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "0e1aa67fd306295f4c21d072e436bcdc1d51fef87aa2fa0b0493b91aefc481c4", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "142afcee25cb84bf6611830b8a71e127deb69232f580a05df56e6ee505a09f05", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedP_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 512, "238320111ff22f9aedabaa9d79954e7db0a395df3c3ff50af7306616307219b1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_clmp_sm100a", 448, "ba95baba14a5cb89863871c716d247bef2b461df8c88a1fd6668e782254bb8ee", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 0
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(1)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 1
, /* mNumTokens */ 0
, /* mRouteImpl */ batchedGemm::RouteImpl(0)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 165888, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_TN_transOut_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 480, "9ae64768181f24c6ca90e899261eabbf76d93319ca54c212cac0764e5f9f89b7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(17826818)
, /* mDtypeB */ trtllm::gen::Dtype(17826818)
, /* mDtypeC */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaA */ trtllm::gen::Dtype(17826818)
, /* mDtypeMmaB */ trtllm::gen::Dtype(17826818)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 8
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 64
, /* mMmaKind */ trtllm::gen::MmaKind(4)
, /* mMmaM */ 128
, /* mMmaN */ 8
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 8
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(0)
, /* mSfLayoutC */ trtllm::gen::SfLayout(1)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "8157f44703c59388fb09afac934d18fb37a94aec57207801f07c8603687ecdbe", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "7477fff17f203078d1a94b4ec28bc0b061c1a1df194da9dfa3e313820f3c010e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "2f8ad526b885e8baa349e382a4ae0ac982d886242594141ae02519979b71b13b", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "3e8760064685db77751a0651e21f944c694263647635671dd3958ed3f7fada28", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "508b4b08f46d803db6ece157302800429d3a315b448f82c594f09dc224054f50", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "fb124fa8b6e12b989155fab1c39e1e9c02f8aaa7b10a48a990e1e69f87baa9d0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "7cec49ee4eb48238caac11db363f407c6af4b5155fc1f422a89ca35ff92d2d7a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "b63a1328798a2b363926f1e908e874ef1a94f5dfa4bdff87f24731d7c5a68636", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "bfd97bbe6574a722b831054f05d8f9a60f921836e77fe307bcae58e37198be70", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "ab4005174cdc2b54fb322c32c184952ecbcbdfbd99d173d7e242d382da88fdf6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "3c46c6b249e5a0cd7340f3dbc7b8072077652ec4c00f825b388f8b8bbea69cbf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "d38baf93035e39c7f6d3928ac5f5e95037202d5f5a668c47559e47725049baa7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "3d88d7cdf8878e120633daa4940007ff6fd4ea0c78dca63819117adeea74afde", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "5fc17978448b4e1bbac488501ae56fbbcdade025754f46e2cb05e94a63b67a82", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "2eaeea22b7f06a714c66e5e7cb07a926c9813c6442c7f81b0d21624d9c8a5920", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 119808, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "028d9e988fc094d7ffe0d9b9416236f89fba7f1bcb8a5fbcfb5dc499b2174955", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 6
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 229376, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x512_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "f29f44b4a5e9666850a7ae0503f150a76fa4af45d18242fa53652c7830a492d6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 229376, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x512u2_s3_et128x16_m128x16x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "e3c30edacd1a1361c6e983c5724253f6f0963c2af08a722f855e0a476fd02d41", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 16
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 16
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 3
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 16
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "3156acfad50fdc17cad8a8361003459321a3464ced26e0028ac7b0e323b134b6", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "96fcdb017b1d06c94709523848b757776a2c00ef601e2b7583f0f6d84e762741", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "020a6868ec1616be5d6672af219d2f37babd931ed22ea7f6b351c51be22f2b2c", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "d1f59e9d0d394769ff8709dccaf513fdf564efd007962aac9ab9f6cc4ab5a0e7", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "7612966658dff79d86fd595ddec5ffe687dad3df5381593a88f837720c236964", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "1c6e04f534cadf6f08823442154ac262c67ffcf39f9ccac08a21a4d9d2a901dd", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "1133ac386e288984301755cb9ea35960ce9356eff891c4f0e125a3216ea36c3e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "04f3fe5b18e41354a51d94d394f030f5196619ac176e2332c5b58f16fc4ef8c0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "e9a6951414db31e6fe9792c25afee459948dd0d2866a5fc78f6faf0441a52dd1", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "be24ef39cdd4e0c681b6e30e5558549487f2db3006a396cbf1a4a71446f99b31", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "62c5de8afbf9ae1890e508e6ff4ebbef8db86ad0f25233bfb24e1e894da6860d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "543cd01fcc2f8a4bd4250bf645085f3d3663c50dcc17e1529c13794677b5accf", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "6032dad65ba48e4b92f9a2fcaaf48f375f9bdd235b8948be74df9e1befb92ecb", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "2a620944a3085c1054f57d0f07c6d19ac9d7958858607658a2961fd0c0eef409", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "f7bdd240292559badd385d400b123e051f2487e3e4dc16baf4b74b71cb4b1cef", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 93184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s4_et64x32_m64x32x32_cga1x1x1_16dp256b_TN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "72ece78605b6d5fafc6bfbbaaef349293f31d58fe16f1ae1bd548f0754b57c4e", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 174080, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x512_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "15cdfa203783589d46b84b273c10c7e0848a5fa3cf158ff0d5c39c9ac0c26bd0", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a_cubin_len, 174080, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x512u2_s2_et128x32_m128x32x32_cga1x1x1_16dp256b_TN_transOut_tokSfB_schedS_bN_ldgsts_clmp_swiGlu_dynBatch_sm100a", 256, "870c199a8a501a4b629f5d1bf97d1c8d76d06c295d76f33cf010e35cc253e99d", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 128
, /* mEpilogueTileN */ 32
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 0
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 128
, /* mMmaN */ 32
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 2
, /* mNumStagesMma */ 1
, /* mNumStagesMmaWithinWorkTile */ 1
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 32
, /* mTileK */ 512
, /* mUseUnrollLoop2xForMma */ 1
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 0
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 1
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 1
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "fe3731381664f21d8643a66720b297fdf4dd2b2e22475bb19149c23c4cb49856", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "0a5c5f1765b677f4b12a5a77bd42255d2b44f00d1f79a33656a248d7f1673b5a", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "ae778ccd730dc872964386dd1f58bf3aa7c396bfa63f5879f6f1e6723bbfb745", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_BN_transOut_noShflA_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "92968153ee4f2bbc6fee0614b1424ebd9639a9f476ba287ccea6d66931429eab", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ 128
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(2)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 0
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedP_bN_ldgsts_clmp_dynBatch_sm100a", 448, "b2efeb8d3eb03af3ddbfc5b6a02fcaebc2d6f7d966a53f11ce5b6246c05f1eac", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 4
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 2
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(1)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a_cubin_len, 115712, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut_dsFp8_schedS_bN_ldgsts_clmp_dynBatch_sm100a", 416, "9f9f0047824e413b32a53d05b28fc137716be2d3fa77718a31fc8762932530b8", { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0)
, /* mBiasType */ gemm::BiasType(0)
, /* mBlockK */ -1
, /* mClusterDimX */ 1
, /* mClusterDimY */ 1
, /* mClusterDimZ */ 1
, /* mDtypeAcc */ trtllm::gen::Dtype(1056776)
, /* mDtypeA */ trtllm::gen::Dtype(1050629)
, /* mDtypeB */ trtllm::gen::Dtype(1050629)
, /* mDtypeC */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaA */ trtllm::gen::Dtype(1050629)
, /* mDtypeMmaB */ trtllm::gen::Dtype(1050629)
, /* mEnablesEarlyExit */ 1
, /* mEnablesDelayedEarlyExit */ 0
, /* mEnablesGlobalPtxKnobs */ 1
, /* mEpilogueLdtmDps */ 16
, /* mEpilogueLdtmBits */ 256
, /* mEpilogueTileM */ 64
, /* mEpilogueTileN */ 64
, /* mGridTriggerSecondaryA */ 0
, /* mGridTriggerSecondaryB */ 1
, /* mGridWaitForPrimaryEarlyExit */ 1
, /* mGridWaitForPrimaryA */ 0
, /* mGridWaitForPrimaryB */ 1
, /* mHoistLoadTaskInit */ 1
, /* mHoistMmaTaskTryWaits */ 1
, /* mK */ 2048
, /* mKernelTraits */ {}
, /* mLayoutA */ gemm::MatrixLayout(0)
, /* mLayoutB */ gemm::MatrixLayout(0)
, /* mM */ 256
, /* mMmaK */ 32
, /* mMmaKind */ trtllm::gen::MmaKind(2)
, /* mMmaM */ 64
, /* mMmaN */ 64
, /* mMockAllReduce */ 0
, /* mN */ 256
, /* mNumSlicesForSplitK */ 1
, /* mNumSlicesForSliceK */ 1
, /* mNumStages */ 4
, /* mNumStagesMma */ 2
, /* mNumStagesMmaWithinWorkTile */ 2
, /* mNumStagesMmaAcrossWorkTile */ 1
, /* mNumStagesWorkId */ 3
, /* mOutputDebugTensors */ 0
, /* mPatchF2fp */ 0
, /* mUseShuffledMatrixA */ 1
, /* mSliceK */ 0
, /* mSplitK */ gemm::SplitK(0)
, /* mTransposeMmaOutput */ 1
, /* mTileM */ 128
, /* mTileN */ 64
, /* mTileK */ 128
, /* mUseUnrollLoop2xForMma */ 0
, /* mUseCustomMmaSchedule */ 1
, /* mUseHoistTryWaitForCustomMmaSchedule */ 0
, /* mUseDeepSeekFp8 */ 1
, /* mUsePerTokenSfA */ 0
, /* mUsePerTokenSfB */ 0
, /* mUseTmaStore */ 1
, /* mUseTwoTmaLoadWarps */ 1
, /* mUseTwoMmaWarps */ 0
, /* mSfLayoutA */ trtllm::gen::SfLayout(3)
, /* mSfLayoutB */ trtllm::gen::SfLayout(3)
, /* mSfLayoutC */ trtllm::gen::SfLayout(3)
, /* mSfReshapeFactor */ 1
, /* mTileScheduler */ gemm::TileScheduler(0)
, /* mActType */ gemmGatedAct::ActType(0)
, /* mClampBeforeAct */ 1
, /* mBatchedM */ {}
, /* mBatchedN */ {}
, /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1)
, /* mNumBatches */ 2
, /* mIsStaticBatch */ 0
, /* mNumTokens */ 2
, /* mRouteImpl */ batchedGemm::RouteImpl(1)
, /* mGridWaitForPrimaryRouting */ 1
, /* mFusedAct */ 0
, /* mNumRegsPerThreadNonEpilogueWarp */ 0
, /* mNumRegsPerThreadEpilogueWarp */ 0
, /* mNumRegsCastAWarps */ 0
, /* mUseTmaOobOpt */ 0
 }, gemm::SmVersion::Sm100a},
{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s4_et64x64_m64x64x32_cga1x1x1_16dp256b_TN_transOut